From 5c9c40431fd5cc6c29b98b9515ac11ce71f58cb5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 2 Mar 2017 16:55:32 +0530
Subject: [PATCH] Use bing to power the amazon metadata plugin

Faster and more reliable than wayback machine
---
 src/calibre/ebooks/metadata/sources/amazon.py | 21 ++++--
 .../ebooks/metadata/sources/search_engines.py | 69 +++++++++++++++++--
 2 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 5cc586670a..3432e9b004 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -28,7 +28,7 @@ class SearchFailed(ValueError):
 
 ua_index = -1
 
-USE_SEARCH_ENGINE = False
+USE_SEARCH_ENGINE = True
 
 
 def parse_details_page(url, log, timeout, browser, domain):
@@ -1218,9 +1218,10 @@ class Amazon(Source):
             identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
-        se = search_engines_module()
         matches = []
-        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+        se = search_engines_module()
+        cover_url_prefix = 'bing'
+        for result in se.bing_search(terms, site, log=log, br=br, timeout=timeout):
             if abort.is_set():
                 return matches, terms, domain, None
 
@@ -1241,7 +1242,7 @@ class Amazon(Source):
                 log('Skipping non-book result:', result)
         if not matches:
             log('No search engine results for terms:', ' '.join(terms))
-        return matches, terms, domain, se.wayback_url_processor
+        return matches, terms, domain, lambda x: (cover_url_prefix + ':' + x)
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@@ -1261,7 +1262,7 @@ class Amazon(Source):
         if udata is not None and not USE_SEARCH_ENGINE:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
-            # usually redirected to a full URL by amazon, which is therefore
+            # usually redirected to a full URL by amazon, and is therefore
             # not cached
             domain, idtype, asin, durl = udata
             if durl is not None:
@@ -1353,10 +1354,16 @@ class Amazon(Source):
         if abort.is_set():
             return
         log('Downloading cover from:', cached_url)
+        br = self.browser
+        se = search_engines_module()
+        url = se.resolve_url(cached_url)
+        if USE_SEARCH_ENGINE:
+            br = br.clone_browser()
+            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(
-                cached_url, timeout=timeout).read()
+            cdata = br.open_novisit(
+                url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 67923f3801..faf1e5f927 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -14,9 +14,9 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib
 
-from calibre import browser as _browser, prints
+from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua
 
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
@@ -27,7 +27,7 @@ Result = namedtuple('Result', 'url title cached_url')
 
 
 def browser():
-    ua = random_user_agent()
+    ua = random_user_agent(allow_ie=False)
     br = _browser(user_agent=ua)
     br.set_handle_gzip(True)
     br.addheaders += [
@@ -64,11 +64,13 @@ def quote_term(x):
     return quote_plus(x.encode('utf-8')).decode('utf-8')
 
 
+# DDG + Wayback machine {{{
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
         t = '"' + t + '"'
-    if t in {'OR', 'AND'}:
+    if t in {'OR', 'AND', 'NOT'}:
         t = t.lower()
     return t
 
@@ -128,3 +130,62 @@ def ddg_develop():
             print(' ', result.url)
             print(' ', wayback_machine_cached_url(result.url, br))
             print()
+# }}}
+
+# Bing {{{
+
+
+def bing_term(t):
+    t = t.replace('"', '')
+    if t in {'OR', 'AND', 'NOT'}:
+        t = t.lower()
+    return t
+
+
+def bing_url_processor(url):
+    return url
+
+
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
+    terms = map(bing_term, terms)
+    terms = [quote_term(t) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.bing.com/search?q={q}'.format(q=q)
+    log('Making bing query: ' + url)
+    br = br or browser()
+    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    ans = []
+    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
+        a = li.xpath('descendant::h2/a[@href]')[0]
+        div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
+        d, w = div.get('u').split('|')[-2:]
+        # The bing cache does not have a valid https certificate currently
+        # (March 2017)
+        cached_url = 'http://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
+            q=q, d=d, w=w)
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), cached_url))
+    return ans
+
+
+def bing_develop():
+    br = browser()
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', result.cached_url)
+            print()
+# }}}
+
+
+def resolve_url(url):
+    prefix, rest = url.partition(':')[::2]
+    if prefix == 'bing':
+        return bing_url_processor(rest)
+    if prefix == 'wayback':
+        return wayback_url_processor(rest)
+    return url
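
Note on the cover-URL tagging scheme: the search function in amazon.py now
returns lambda x: (cover_url_prefix + ':' + x) in place of
se.wayback_url_processor, so cached cover URLs are stored tagged as
"bing:<url>", and download_cover() later calls se.resolve_url() to strip
the tag off again before fetching. Below is a minimal standalone sketch of
that tag-and-dispatch pattern (illustrative only, not calibre's actual
code; the wayback branch is stubbed rather than guessing at the rewrite
wayback_url_processor performs):

def make_url_processor(prefix):
    # Same shape as the lambda returned alongside the matches above.
    return lambda url: prefix + ':' + url

def resolve_url(url):
    # Mirrors the new resolve_url() in search_engines.py: split off the
    # engine tag and dispatch on it; untagged URLs pass through unchanged,
    # since "http"/"https" matches neither prefix.
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':
        return rest  # bing_url_processor() is currently the identity
    if prefix == 'wayback':
        return rest  # calibre would apply wayback_url_processor() here
    return url

tag = make_url_processor('bing')
cached = tag('http://cc.bingj.com/cache.aspx?d=1&w=2')
assert resolve_url(cached) == 'http://cc.bingj.com/cache.aspx?d=1&w=2'
assert resolve_url('https://www.amazon.com/dp/X') == 'https://www.amazon.com/dp/X'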
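
Note on the cache-URL extraction in bing_search(): each result <li> in
bing's markup carries a b_attribution <div> whose "u" attribute packs
several |-separated fields; the last two are the d and w parameters of the
cc.bingj.com cache endpoint. A sketch of just that step, using
illustrative markup and an illustrative query string (neither captured
from a real bing response):

from lxml import html

sample = (
    '<li class="b_algo">'
    '<h2><a href="https://www.amazon.com/dp/0000000000">Half a King</a></h2>'
    '<div class="b_attribution" u="50|5066|4567890123456789|AbCdEfGh">'
    'amazon.com</div>'
    '</li>')

li = html.fromstring(sample)  # fragment parse returns the <li> element
div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
d, w = div.get('u').split('|')[-2:]
cached_url = ('http://cc.bingj.com/cache.aspx'
              '?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}').format(
                  q='half+a+king', d=d, w=w)
print(cached_url)
# -> http://cc.bingj.com/cache.aspx?q=half+a+king&d=4567890123456789&mkt=en-US&setlang=en-US&w=AbCdEfGh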