Use bing to power the amazon metadata plugin

Faster and more reliable than the Wayback Machine
Kovid Goyal 2017-03-02 16:55:32 +05:30
parent 287fa950a1
commit 5c9c40431f
2 changed files with 79 additions and 11 deletions
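In outline: metadata search now goes through Bing instead of DuckDuckGo, and the plugin tags each cached result URL with the engine that produced it ('bing:...'), so the cover download step can untag it later via the new resolve_url() helper in the search engines module. A minimal sketch of that round trip; make_url_processor() is a hypothetical stand-in for the lambda the plugin now returns, and resolve_url() here mirrors the helper added in the diff below:

# Sketch only -- make_url_processor() is illustrative, not part of the
# commit; resolve_url() mirrors the new helper in the search engines module.
def make_url_processor(prefix):
    return lambda x: prefix + ':' + x  # same shape as the plugin's lambda

def resolve_url(url):
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':  # bing_url_processor() is the identity
        return rest
    return url

tag = make_url_processor('bing')
cached = tag('http://cc.bingj.com/cache.aspx?q=some+query')
assert resolve_url(cached) == 'http://cc.bingj.com/cache.aspx?q=some+query'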

src/calibre/ebooks/metadata/sources/amazon.py

@@ -28,7 +28,7 @@ class SearchFailed(ValueError):
 ua_index = -1
-USE_SEARCH_ENGINE = False
+USE_SEARCH_ENGINE = True


 def parse_details_page(url, log, timeout, browser, domain):
@@ -1218,9 +1218,10 @@ class Amazon(Source):
             identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
-        se = search_engines_module()
         matches = []
-        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+        se = search_engines_module()
+        cover_url_prefix = 'bing'
+        for result in se.bing_search(terms, site, log=log, br=br, timeout=timeout):
             if abort.is_set():
                 return matches, terms, domain, None
@@ -1241,7 +1242,7 @@ class Amazon(Source):
                 log('Skipping non-book result:', result)
         if not matches:
             log('No search engine results for terms:', ' '.join(terms))
-        return matches, terms, domain, se.wayback_url_processor
+        return matches, terms, domain, lambda x: (cover_url_prefix + ':' + x)
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@@ -1261,7 +1262,7 @@ class Amazon(Source):
         if udata is not None and not USE_SEARCH_ENGINE:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
-            # usually redirected to a full URL by amazon, which is therefore
+            # usually redirected to a full URL by amazon, and is therefore
             # not cached
             domain, idtype, asin, durl = udata
             if durl is not None:
@@ -1353,10 +1354,16 @@ class Amazon(Source):
         if abort.is_set():
             return
         log('Downloading cover from:', cached_url)
+        br = self.browser
+        se = search_engines_module()
+        url = se.resolve_url(cached_url)
+        if USE_SEARCH_ENGINE:
+            br = br.clone_browser()
+            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(
-                cached_url, timeout=timeout).read()
+            cdata = br.open_novisit(
+                url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
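The download_cover() hunk above now resolves the tagged URL and, when the search engine path is active, clones the browser and sends the Amazon storefront as the Referer before fetching. A rough standalone sketch of that flow; fetch_cached_cover() is a hypothetical helper, while clone_browser(), set_current_header() and open_novisit() are the calibre browser methods used in the diff:

from calibre import browser  # calibre's mechanize-based browser factory

def fetch_cached_cover(cached_url, referrer, timeout=60):
    # Hypothetical helper mirroring the new download_cover() path:
    # strip the 'bing:'/'wayback:' tag, then fetch with a Referer set.
    url = resolve_url(cached_url)  # se.resolve_url() in the plugin
    br = browser().clone_browser()
    br.set_current_header('Referer', referrer)
    return br.open_novisit(url, timeout=timeout).read()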

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -14,9 +14,9 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib

-from calibre import browser as _browser, prints
+from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua

 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
@@ -27,7 +27,7 @@ Result = namedtuple('Result', 'url title cached_url')


 def browser():
-    ua = random_user_agent()
+    ua = random_user_agent(allow_ie=False)
     br = _browser(user_agent=ua)
     br.set_handle_gzip(True)
     br.addheaders += [
@@ -64,11 +64,13 @@ def quote_term(x):
     return quote_plus(x.encode('utf-8')).decode('utf-8')


+# DDG + Wayback machine {{{
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
         t = '"' + t + '"'
-    if t in {'OR', 'AND'}:
+    if t in {'OR', 'AND', 'NOT'}:
         t = t.lower()
     return t
@@ -128,3 +130,62 @@ def ddg_develop():
             print(' ', result.url)
             print(' ', wayback_machine_cached_url(result.url, br))
             print()
+# }}}
+
+# Bing {{{
+
+def bing_term(t):
+    t = t.replace('"', '')
+    if t in {'OR', 'AND', 'NOT'}:
+        t = t.lower()
+    return t
+
+
+def bing_url_processor(url):
+    return url
+
+
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
+    terms = map(bing_term, terms)
+    terms = [quote_term(t) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.bing.com/search?q={q}'.format(q=q)
+    log('Making bing query: ' + url)
+    br = br or browser()
+    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    ans = []
+    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
+        a = li.xpath('descendant::h2/a[@href]')[0]
+        div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
+        d, w = div.get('u').split('|')[-2:]
+        # The bing cache does not have a valid https certificate currently
+        # (March 2017)
+        cached_url = 'http://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
+            q=q, d=d, w=w)
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), cached_url))
+    return ans
+
+
+def bing_develop():
+    br = browser()
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', result.cached_url)
+            print()
+# }}}
+
+
+def resolve_url(url):
+    prefix, rest = url.partition(':')[::2]
+    if prefix == 'bing':
+        return bing_url_processor(rest)
+    if prefix == 'wayback':
+        return wayback_url_processor(rest)
+    return url
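A quick sanity check of the dispatcher above, with illustrative values: since bing_url_processor() is the identity, resolving only strips the 'bing:' tag, and URLs without a known tag pass through unchanged. For a live smoke test against Bing itself, bing_develop() above can be run from a calibre development environment.

assert resolve_url('bing:http://cc.bingj.com/cache.aspx?q=x') == 'http://cc.bingj.com/cache.aspx?q=x'
assert resolve_url('http://example.com/untagged') == 'http://example.com/untagged'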