Code to search using ddg

2026-01-03 10:40:21 -05:00 · 2017-03-01 20:37:47 +05:30 · 2017-03-01 20:37:47 +05:30 · 45c0fe8c54
commit 45c0fe8c54
parent 59fdacad91
1 changed files with 80 additions and 1 deletions
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -4,7 +4,86 @@

 from __future__ import absolute_import, division, print_function, unicode_literals

+import time
+from collections import defaultdict, namedtuple
+from future_builtins import map
+from urllib import quote_plus, urlencode
+from urlparse import parse_qs
+
+from lxml import etree
+
+import html5lib
+from calibre import browser, prints
+from calibre.utils.monotonic import monotonic
+
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)

-# DDG: https://duckduckgo.com/html/?q={search_terms}
+
+# Per-key timestamp (taken from monotonic()) of the most recent request made
+# through query(); a key that has never been used reads as 0.
+last_visited = defaultdict(int)
+# One search hit. ddg_search() fills in url and text and passes None for
+# cached_url.
+Result = namedtuple('Result', 'url text cached_url')
+
+
+def encode_query(**query):
+    """Return the keyword arguments as a URL-encoded query string.
+
+    Every key and value is encoded to UTF-8 before being passed to
+    ``urlencode``; the encoded result is decoded from UTF-8 again before it
+    is returned.
+    """
+    encoded = {}
+    for name, value in query.items():
+        encoded[name.encode('utf-8')] = value.encode('utf-8')
+    return urlencode(encoded).decode('utf-8')
+
+
+def parse(raw):
+    """Parse ``raw`` with html5lib using its lxml tree builder.
+
+    HTML elements are left un-namespaced (``namespaceHTMLElements=False``).
+    """
+    tree = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    return tree
+
+
+def query(br, url, key, dump_raw=None):
+    """Fetch ``url`` with ``br`` and return the parsed response.
+
+    Requests that share the same ``key`` are throttled: when the previous
+    request for ``key`` was made less than one second ago, this call first
+    waits out the remainder of that second.
+
+    :param br: browser object; only ``open_novisit(url).read()`` is used.
+    :param url: the URL to fetch.
+    :param key: name of the throttling bucket in ``last_visited``.
+    :param dump_raw: optional path; when not None the raw response is
+        written to it in binary mode.
+    :return: whatever ``parse()`` returns for the raw response.
+    """
+    elapsed = monotonic() - last_visited[key]
+    if 0 < elapsed < 1:
+        # Sleep for the time still missing from the one second window, not
+        # for the time that has already passed; sleeping for ``elapsed``
+        # would let two requests through only 2 * elapsed seconds apart.
+        time.sleep(1 - elapsed)
+    try:
+        raw = br.open_novisit(url).read()
+    finally:
+        # Record the attempt even when the request raised, so that failed
+        # requests count towards the throttle as well.
+        last_visited[key] = monotonic()
+    if dump_raw is not None:
+        with open(dump_raw, 'wb') as f:
+            f.write(raw)
+    return parse(raw)
+
+
+def quote_term(x):
+    """Return ``x`` quoted with ``quote_plus`` for use in a URL query.
+
+    The term is encoded to UTF-8 before quoting and the quoted result is
+    decoded from UTF-8 before it is returned.
+    """
+    utf8 = x.encode('utf-8')
+    return quote_plus(utf8).decode('utf-8')
+
+
+def ddg_term(t):
+    """Normalise a single search term before it is put into a ddg query.
+
+    * every double quote is removed from the term;
+    * ``map`` and ``news`` (compared case-insensitively) are wrapped in
+      double quotes;
+    * the upper-case words ``OR`` and ``AND`` are lower-cased.
+
+    NOTE(review): presumably the last two rules stop DuckDuckGo from
+    treating these words as special syntax -- confirm against its search
+    syntax documentation.
+    """
+    bare = t.replace('"', '')
+    if bare.lower() in {'map', 'news'}:
+        return '"' + bare + '"'
+    if bare in {'OR', 'AND'}:
+        return bare.lower()
+    return bare
+
+
+def ddg_href(url):
+    """Return the target URL of a result link.
+
+    A ``url`` that does not start with ``/`` is returned unchanged.
+    Otherwise the first value of the ``uddg`` parameter in its query string
+    is returned; a KeyError is raised when that parameter is absent.
+    """
+    if not url.startswith('/'):
+        return url
+    query_string = url.partition('?')[2]
+    params = parse_qs(query_string.encode('utf-8'))
+    return params['uddg'][0].decode('utf-8')
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+    """Run a search against DuckDuckGo's HTML endpoint.
+
+    :param terms: iterable of search terms; each one is passed through
+        ``ddg_term`` and ``quote_term``.
+    :param site: when not None, a ``site:<site>`` term is appended.
+    :param br: browser to use; a new ``browser()`` is created when falsy.
+    :param log: callable that receives a message containing the query URL.
+    :param safe_search: sends ``kp=1`` when true, ``kp=-1`` otherwise.
+    :param dump_raw: forwarded to ``query()``.
+    :return: list of ``Result`` tuples holding the link target and the link
+        text of every matching result anchor; ``cached_url`` is None.
+    """
+    # https://duck.co/help/results/syntax
+    quoted = [quote_term(ddg_term(term)) for term in terms]
+    if site is not None:
+        quoted.append(quote_term('site:' + site))
+    kp = 1 if safe_search else -1
+    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(q='+'.join(quoted), kp=kp)
+    log('Making ddg query: ' + url)
+    if not br:
+        br = browser()
+    root = query(br, url, 'ddg', dump_raw)
+    links = root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]')
+    return [
+        Result(ddg_href(link.get('href')), etree.tostring(link, encoding=unicode, method='text', with_tail=False), None)
+        for link in links
+    ]
+
+
+def ddg_develop():
+    """Development aid: run a sample ddg search and pretty-print the results.
+
+    Searches for "heroes abercrombie" restricted to amazon.com and has the
+    raw response dumped to ``/t/raw.html``.
+    """
+    from pprint import pprint
+    results = ddg_search('heroes abercrombie'.split(), 'amazon.com', dump_raw='/t/raw.html')
+    pprint(results)