Mirror of https://github.com/kovidgoyal/calibre.git
Commit 5c9c40431f (parent 287fa950a1)

Use bing to power the amazon metadata plugin

Faster and more reliable than the wayback machine
@@ -28,7 +28,7 @@ class SearchFailed(ValueError):
 
 
 ua_index = -1
-USE_SEARCH_ENGINE = False
+USE_SEARCH_ENGINE = True
 
 
 def parse_details_page(url, log, timeout, browser, domain):
@@ -1218,9 +1218,10 @@ class Amazon(Source):
             identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
-        se = search_engines_module()
         matches = []
-        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+        se = search_engines_module()
+        cover_url_prefix = 'bing'
+        for result in se.bing_search(terms, site, log=log, br=br, timeout=timeout):
             if abort.is_set():
                 return matches, terms, domain, None
 
@@ -1241,7 +1242,7 @@ class Amazon(Source):
                 log('Skipping non-book result:', result)
         if not matches:
             log('No search engine results for terms:', ' '.join(terms))
-        return matches, terms, domain, se.wayback_url_processor
+        return matches, terms, domain, lambda x: (cover_url_prefix + ':' + x)
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
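Note: the lambda above replaces se.wayback_url_processor as the value handed back to the cover-download path. Instead of returning an engine-specific processor, the search now tags each cached URL with a scheme-like prefix, and the dispatching happens later in resolve_url() (added at the end of this diff). A minimal sketch of the tagging, using only names from this commit:

    cover_url_prefix = 'bing'
    tag = lambda x: (cover_url_prefix + ':' + x)  # the processor returned above
    tag('http://cc.bingj.com/cache.aspx?q=...')
    # -> 'bing:http://cc.bingj.com/cache.aspx?q=...'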
@@ -1261,7 +1262,7 @@ class Amazon(Source):
         if udata is not None and not USE_SEARCH_ENGINE:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
-            # usually redirected to a full URL by amazon, which is therefore
+            # usually redirected to a full URL by amazon, and is therefore
             # not cached
             domain, idtype, asin, durl = udata
             if durl is not None:
@@ -1353,10 +1354,16 @@ class Amazon(Source):
         if abort.is_set():
             return
         log('Downloading cover from:', cached_url)
+        br = self.browser
+        se = search_engines_module()
+        url = se.resolve_url(cached_url)
+        if USE_SEARCH_ENGINE:
+            br = br.clone_browser()
+            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(
-                cached_url, timeout=timeout).read()
+            cdata = br.open_novisit(
+                url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
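Note on the clone_browser() call above: the plugin's browser is shared across requests, so the Referer header (which makes the cache fetch look like a click-through from the Amazon storefront) is set on a per-download clone rather than on the shared instance. A hypothetical helper, not part of the commit, showing the same pattern under the assumption that calibre's browser exposes clone_browser(), set_current_header() and open_novisit() as used in the hunk above:

    def fetch_with_referer(br, url, referer, timeout=60):
        # Clone first so the Referer never leaks into unrelated
        # requests made with the shared plugin browser.
        br = br.clone_browser()
        br.set_current_header('Referer', referer)
        return br.open_novisit(url, timeout=timeout).read()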
@@ -14,9 +14,9 @@ from urlparse import parse_qs
 from lxml import etree
 
 import html5lib
-from calibre import browser as _browser, prints
+from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua
 
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
@@ -27,7 +27,7 @@ Result = namedtuple('Result', 'url title cached_url')
 
 
 def browser():
-    ua = random_user_agent()
+    ua = random_user_agent(allow_ie=False)
     br = _browser(user_agent=ua)
     br.set_handle_gzip(True)
     br.addheaders += [
@@ -64,11 +64,13 @@ def quote_term(x):
     return quote_plus(x.encode('utf-8')).decode('utf-8')
 
 
+# DDG + Wayback machine {{{
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
         t = '"' + t + '"'
-    if t in {'OR', 'AND'}:
+    if t in {'OR', 'AND', 'NOT'}:
         t = t.lower()
     return t
 
@@ -128,3 +130,62 @@ def ddg_develop():
             print(' ', result.url)
             print(' ', wayback_machine_cached_url(result.url, br))
             print()
+# }}}
+
+# Bing {{{
+
+
+def bing_term(t):
+    t = t.replace('"', '')
+    if t in {'OR', 'AND', 'NOT'}:
+        t = t.lower()
+    return t
+
+
+def bing_url_processor(url):
+    return url
+
+
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
+    terms = map(bing_term, terms)
+    terms = [quote_term(t) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.bing.com/search?q={q}'.format(q=q)
+    log('Making bing query: ' + url)
+    br = br or browser()
+    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    ans = []
+    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
+        a = li.xpath('descendant::h2/a[@href]')[0]
+        div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
+        d, w = div.get('u').split('|')[-2:]
+        # The bing cache does not have a valid https certificate currently
+        # (March 2017)
+        cached_url = 'http://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
+            q=q, d=d, w=w)
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), cached_url))
+    return ans
+
+
+def bing_develop():
+    br = browser()
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', result.cached_url)
+            print()
+# }}}
+
+
+def resolve_url(url):
+    prefix, rest = url.partition(':')[::2]
+    if prefix == 'bing':
+        return bing_url_processor(rest)
+    if prefix == 'wayback':
+        return wayback_url_processor(rest)
+    return url
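Taken together, a cover URL now round-trips through the prefix scheme: search_search_engine() returns it tagged as 'bing:<cached-url>' and the cover download passes it through resolve_url() to strip the tag again. A self-contained sketch of just that mechanism, with stand-ins mirroring the functions above (the wayback branch is omitted because wayback_url_processor() is not shown in this diff):

    def bing_url_processor(url):
        # Identity: bing cache URLs are usable as-is.
        return url

    def resolve_url(url):
        prefix, rest = url.partition(':')[::2]
        if prefix == 'bing':
            return bing_url_processor(rest)
        # 'wayback:'-prefixed URLs go to wayback_url_processor() (not shown)
        return url

    cover_url_prefix = 'bing'
    tag = lambda x: (cover_url_prefix + ':' + x)  # processor returned by the search

    cached = tag('http://cc.bingj.com/cache.aspx?q=heroes&d=123&w=abc')
    assert resolve_url(cached) == 'http://cc.bingj.com/cache.aspx?q=heroes&d=123&w=abc'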