From 5c9c40431fd5cc6c29b98b9515ac11ce71f58cb5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 2 Mar 2017 16:55:32 +0530
Subject: [PATCH] Use bing to power the amazon metadata plugin

Faster and more reliable than wayback machine
---
 src/calibre/ebooks/metadata/sources/amazon.py | 21 ++++--
 .../ebooks/metadata/sources/search_engines.py | 69 +++++++++++++++++--
 2 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 5cc586670a..3432e9b004 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -28,7 +28,7 @@ class SearchFailed(ValueError):
 
 ua_index = -1
 
-USE_SEARCH_ENGINE = False
+USE_SEARCH_ENGINE = True
 
 
 def parse_details_page(url, log, timeout, browser, domain):
@@ -1218,9 +1218,10 @@ class Amazon(Source):
             identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
-        se = search_engines_module()
         matches = []
-        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+        se = search_engines_module()
+        cover_url_prefix = 'bing'
+        for result in se.bing_search(terms, site, log=log, br=br, timeout=timeout):
             if abort.is_set():
                 return matches, terms, domain, None
 
@@ -1241,7 +1242,7 @@ class Amazon(Source):
                 log('Skipping non-book result:', result)
         if not matches:
             log('No search engine results for terms:', ' '.join(terms))
-        return matches, terms, domain, se.wayback_url_processor
+        return matches, terms, domain, lambda x: (cover_url_prefix + ':' + x)
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@@ -1261,7 +1262,7 @@ class Amazon(Source):
         if udata is not None and not USE_SEARCH_ENGINE:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
-            # usually redirected to a full URL by amazon, which is therefore
+            # usually redirected to a full URL by amazon, and is therefore
             # not cached
             domain, idtype, asin, durl = udata
             if durl is not None:
@@ -1353,10 +1354,16 @@ class Amazon(Source):
         if abort.is_set():
             return
         log('Downloading cover from:', cached_url)
+        br = self.browser
+        se = search_engines_module()
+        url = se.resolve_url(cached_url)
+        if USE_SEARCH_ENGINE:
+            br = br.clone_browser()
+            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(
-                cached_url, timeout=timeout).read()
+            cdata = br.open_novisit(
+                url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 67923f3801..faf1e5f927 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -14,9 +14,9 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib
 
-from calibre import browser as _browser, prints
+from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua
 
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
@@ -27,7 +27,7 @@ Result = namedtuple('Result', 'url title cached_url')
 
 
 def browser():
-    ua = random_user_agent()
+    ua = random_user_agent(allow_ie=False)
     br = _browser(user_agent=ua)
     br.set_handle_gzip(True)
     br.addheaders += [
@@ -64,11 +64,13 @@ def quote_term(x):
     return quote_plus(x.encode('utf-8')).decode('utf-8')
 
 
+# DDG + Wayback machine {{{
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
         t = '"' + t + '"'
-    if t in {'OR', 'AND'}:
+    if t in {'OR', 'AND', 'NOT'}:
         t = t.lower()
     return t
 
@@ -128,3 +130,62 @@ def ddg_develop():
             print(' ', result.url)
             print(' ', wayback_machine_cached_url(result.url, br))
             print()
+# }}}
+
+# Bing {{{
+
+
+def bing_term(t):
+    t = t.replace('"', '')
+    if t in {'OR', 'AND', 'NOT'}:
+        t = t.lower()
+    return t
+
+
+def bing_url_processor(url):
+    return url
+
+
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
+    terms = map(bing_term, terms)
+    terms = [quote_term(t) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.bing.com/search?q={q}'.format(q=q)
+    log('Making bing query: ' + url)
+    br = br or browser()
+    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    ans = []
+    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
+        a = li.xpath('descendant::h2/a[@href]')[0]
+        div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
+        d, w = div.get('u').split('|')[-2:]
+        # The bing cache does not have a valid https certificate currently
+        # (March 2017)
+        cached_url = 'http://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
+            q=q, d=d, w=w)
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), cached_url))
+    return ans
+
+
+def bing_develop():
+    br = browser()
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', result.cached_url)
+            print()
+# }}}
+
+
+def resolve_url(url):
+    prefix, rest = url.partition(':')[::2]
+    if prefix == 'bing':
+        return bing_url_processor(rest)
+    if prefix == 'wayback':
+        return wayback_url_processor(rest)
+    return url
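
Note on the cover-URL tagging scheme: the search function in amazon.py now
returns lambda x: (cover_url_prefix + ':' + x) in place of
se.wayback_url_processor, so cached cover URLs are stored tagged as
"bing:<url>", and download_cover() later calls se.resolve_url() to strip
the tag off again before fetching. Below is a minimal standalone sketch of
that tag-and-dispatch pattern (illustrative only, not calibre's actual
code; the wayback branch is stubbed rather than guessing at the rewrite
wayback_url_processor performs):

def make_url_processor(prefix):
    # Same shape as the lambda returned alongside the matches above.
    return lambda url: prefix + ':' + url

def resolve_url(url):
    # Mirrors the new resolve_url() in search_engines.py: split off the
    # engine tag and dispatch on it; untagged URLs pass through unchanged,
    # since "http"/"https" matches neither prefix.
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':
        return rest  # bing_url_processor() is currently the identity
    if prefix == 'wayback':
        return rest  # calibre would apply wayback_url_processor() here
    return url

tag = make_url_processor('bing')
cached = tag('http://cc.bingj.com/cache.aspx?d=1&w=2')
assert resolve_url(cached) == 'http://cc.bingj.com/cache.aspx?d=1&w=2'
assert resolve_url('https://www.amazon.com/dp/X') == 'https://www.amazon.com/dp/X'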
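
Note on the cache-URL extraction in bing_search(): each result <li> in
bing's markup carries a b_attribution <div> whose "u" attribute packs
several |-separated fields; the last two are the d and w parameters of the
cc.bingj.com cache endpoint. A sketch of just that step, using
illustrative markup and an illustrative query string (neither captured
from a real bing response):

from lxml import html

sample = (
    '<li class="b_algo">'
    '<h2><a href="https://www.amazon.com/dp/0000000000">Half a King</a></h2>'
    '<div class="b_attribution" u="50|5066|4567890123456789|AbCdEfGh">'
    'amazon.com</div>'
    '</li>')

li = html.fromstring(sample)  # fragment parse returns the <li> element
div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
d, w = div.get('u').split('|')[-2:]
cached_url = ('http://cc.bingj.com/cache.aspx'
              '?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}').format(
                  q='half+a+king', d=d, w=w)
print(cached_url)
# -> http://cc.bingj.com/cache.aspx?q=half+a+king&d=4567890123456789&mkt=en-US&setlang=en-US&w=AbCdEfGh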