Use bing to power the amazon metadata plugin

Faster and more reliable than the Wayback Machine
Kovid Goyal 2017-03-02 16:55:32 +05:30
parent 287fa950a1
commit 5c9c40431f
2 changed files with 79 additions and 11 deletions
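In outline: metadata search now goes through Bing instead of DuckDuckGo, and the plugin tags each cached result URL with the engine that produced it ('bing:...'), so the cover download step can untag it later via the new resolve_url() helper in the search engines module. A minimal sketch of that round trip; make_url_processor() is a hypothetical stand-in for the lambda the plugin now returns, and resolve_url() here mirrors the helper added in the diff below:

# Sketch only -- make_url_processor() is illustrative, not part of the
# commit; resolve_url() mirrors the new helper in the search engines module.
def make_url_processor(prefix):
    return lambda x: prefix + ':' + x  # same shape as the plugin's lambda

def resolve_url(url):
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':  # bing_url_processor() is the identity
        return rest
    return url

tag = make_url_processor('bing')
cached = tag('http://cc.bingj.com/cache.aspx?q=some+query')
assert resolve_url(cached) == 'http://cc.bingj.com/cache.aspx?q=some+query'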

src/calibre/ebooks/metadata/sources/amazon.py

@@ -28,7 +28,7 @@ class SearchFailed(ValueError):
 ua_index = -1
-USE_SEARCH_ENGINE = False
+USE_SEARCH_ENGINE = True


 def parse_details_page(url, log, timeout, browser, domain):
@@ -1218,9 +1218,10 @@ class Amazon(Source):
             identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
-        se = search_engines_module()
         matches = []
-        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+        se = search_engines_module()
+        cover_url_prefix = 'bing'
+        for result in se.bing_search(terms, site, log=log, br=br, timeout=timeout):
             if abort.is_set():
                 return matches, terms, domain, None
@@ -1241,7 +1242,7 @@ class Amazon(Source):
                 log('Skipping non-book result:', result)
         if not matches:
             log('No search engine results for terms:', ' '.join(terms))
-        return matches, terms, domain, se.wayback_url_processor
+        return matches, terms, domain, lambda x: (cover_url_prefix + ':' + x)
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@@ -1261,7 +1262,7 @@ class Amazon(Source):
         if udata is not None and not USE_SEARCH_ENGINE:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
-            # usually redirected to a full URL by amazon, which is therefore
+            # usually redirected to a full URL by amazon, and is therefore
             # not cached
             domain, idtype, asin, durl = udata
             if durl is not None:
@@ -1353,10 +1354,16 @@ class Amazon(Source):
         if abort.is_set():
             return
         log('Downloading cover from:', cached_url)
+        br = self.browser
+        se = search_engines_module()
+        url = se.resolve_url(cached_url)
+        if USE_SEARCH_ENGINE:
+            br = br.clone_browser()
+            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(
-                cached_url, timeout=timeout).read()
+            cdata = br.open_novisit(
+                url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
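The download_cover() hunk above now resolves the tagged URL and, when the search engine path is active, clones the browser and sends the Amazon storefront as the Referer before fetching. A rough standalone sketch of that flow; fetch_cached_cover() is a hypothetical helper, while clone_browser(), set_current_header() and open_novisit() are the calibre browser methods used in the diff:

from calibre import browser  # calibre's mechanize-based browser factory

def fetch_cached_cover(cached_url, referrer, timeout=60):
    # Hypothetical helper mirroring the new download_cover() path:
    # strip the 'bing:'/'wayback:' tag, then fetch with a Referer set.
    url = resolve_url(cached_url)  # se.resolve_url() in the plugin
    br = browser().clone_browser()
    br.set_current_header('Referer', referrer)
    return br.open_novisit(url, timeout=timeout).read()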

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -14,9 +14,9 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib

-from calibre import browser as _browser, prints
+from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua

 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
@@ -27,7 +27,7 @@ Result = namedtuple('Result', 'url title cached_url')


 def browser():
-    ua = random_user_agent()
+    ua = random_user_agent(allow_ie=False)
     br = _browser(user_agent=ua)
     br.set_handle_gzip(True)
     br.addheaders += [
@@ -64,11 +64,13 @@ def quote_term(x):
     return quote_plus(x.encode('utf-8')).decode('utf-8')


+# DDG + Wayback machine {{{
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
         t = '"' + t + '"'
-    if t in {'OR', 'AND'}:
+    if t in {'OR', 'AND', 'NOT'}:
         t = t.lower()
     return t
@@ -128,3 +130,62 @@ def ddg_develop():
             print(' ', result.url)
             print(' ', wayback_machine_cached_url(result.url, br))
             print()
+# }}}
+
+# Bing {{{
+
+def bing_term(t):
+    t = t.replace('"', '')
+    if t in {'OR', 'AND', 'NOT'}:
+        t = t.lower()
+    return t
+
+
+def bing_url_processor(url):
+    return url
+
+
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
+    terms = map(bing_term, terms)
+    terms = [quote_term(t) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.bing.com/search?q={q}'.format(q=q)
+    log('Making bing query: ' + url)
+    br = br or browser()
+    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    ans = []
+    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
+        a = li.xpath('descendant::h2/a[@href]')[0]
+        div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
+        d, w = div.get('u').split('|')[-2:]
+        # The bing cache does not have a valid https certificate currently
+        # (March 2017)
+        cached_url = 'http://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
+            q=q, d=d, w=w)
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), cached_url))
+    return ans
+
+
+def bing_develop():
+    br = browser()
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', result.cached_url)
+            print()
+# }}}
+
+
+def resolve_url(url):
+    prefix, rest = url.partition(':')[::2]
+    if prefix == 'bing':
+        return bing_url_processor(rest)
+    if prefix == 'wayback':
+        return wayback_url_processor(rest)
+    return url
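A quick sanity check of the dispatcher above, with illustrative values: since bing_url_processor() is the identity, resolving only strips the 'bing:' tag, and URLs without a known tag pass through unchanged. For a live smoke test against Bing itself, bing_develop() above can be run from a calibre development environment.

assert resolve_url('bing:http://cc.bingj.com/cache.aspx?q=x') == 'http://cc.bingj.com/cache.aspx?q=x'
assert resolve_url('http://example.com/untagged') == 'http://example.com/untagged'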