Amazon metadata download: Speed up metadata download from amazon.com when an ASIN is specified, by trying the product page directly first instead of running a search. This also works around an Amazon server problem where Kindle Editions are not returned in search results when not logged in. Fixes #1433125 [metadata: error no matches found](https://bugs.launchpad.net/calibre/+bug/1433125)

This commit is contained in:
Kovid Goyal 2015-03-18 10:55:53 +05:30
parent 0a3b9f678f
commit 2653a0c67b

View File

@ -19,6 +19,70 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
def parse_details_page(url, log, timeout, browser, domain):
    '''
    Download an Amazon product details page and parse it.

    :param url: URL of the details page to fetch.
    :param log: Logger; it is called directly and through its ``error()``
        and ``exception()`` methods.
    :param timeout: Timeout handed to ``browser.open_novisit()``.
    :param browser: Browser object providing ``open_novisit()``.
    :param domain: Amazon domain code. For ``'jp'`` a "black curtain"
        redirect link found in the page is followed.
    :return: ``(raw, root, selector)``: the page bytes as downloaded
        (stripped), the parsed tree, and a ``css_selectors.Select`` built on
        that tree. ``None`` if the download or the parse failed, or if the
        page contains an ``errorMessage`` element; the reason is logged.
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        # A timeout shows up as a socket.timeout wrapped in the first
        # argument of the raised exception.
        args = getattr(e, 'args', None) or [None]
        if isinstance(args[0], socket.timeout):
            log.error('Amazon timed out. Try again later.')
        else:
            log.exception('Failed to make details query: %r'%url)
        return

    # Keep the undecoded download; it is what gets returned to the caller.
    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:
        log.exception('Failed to parse amazon details page: %r'%url)
        return

    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        # xpath() returns a list of elements; tostring() serialises a single
        # element, so pass the first match rather than the list itself.
        msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def parse_asin(root, log, url):
    '''
    Extract the ASIN from the canonical link of a parsed details page.

    The ASIN is taken to be the last path segment of the first
    ``<link rel="canonical" href="...">`` element. A query string, fragment
    or trailing slash on the href is ignored, so it does not end up in (or
    blank out) the returned value.

    :param root: Parsed page; must provide ``xpath()``.
    :param log: Logger whose ``exception()`` is used if parsing fails.
    :param url: URL the page came from; only used in the log message.
    :return: The ASIN string, or ``None`` if there is no canonical link or
        an error occurred (the error is logged, not raised).
    '''
    try:
        links = root.xpath('//link[@rel="canonical" and @href]')
        if not links:
            return None
        href = links[0].get('href').strip()
        # Keep only the path part, then drop any trailing slash so that the
        # last segment is the identifier itself.
        path = href.partition('#')[0].partition('?')[0].rstrip('/')
        return path.rpartition('/')[-1]
    except Exception:
        log.exception('Error parsing ASIN for url: %r'%url)
class Worker(Thread): # Get details {{{ class Worker(Thread): # Get details {{{
''' '''
@ -26,8 +90,9 @@ class Worker(Thread): # Get details {{{
''' '''
def __init__(self, url, result_queue, browser, log, relevance, domain, def __init__(self, url, result_queue, browser, log, relevance, domain,
plugin, timeout=20, testing=False): plugin, timeout=20, testing=False, preparsed_root=None):
Thread.__init__(self) Thread.__init__(self)
self.preparsed_root = preparsed_root
self.daemon = True self.daemon = True
self.testing = testing self.testing = testing
self.url, self.result_queue = url, result_queue self.url, self.result_queue = url, result_queue
@ -213,67 +278,18 @@ class Worker(Thread): # Get details {{{
self.log.exception('get_details failed for url: %r'%self.url) self.log.exception('get_details failed for url: %r'%self.url)
def get_details(self): def get_details(self):
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
import html5lib
try: if self.preparsed_root is None:
raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
except Exception as e: else:
if callable(getattr(e, 'getcode', None)) and \ raw, root, selector = self.preparsed_root
e.getcode() == 404:
self.log.error('URL malformed: %r'%self.url)
return
attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
msg = 'Amazon timed out. Try again later.'
self.log.error(msg)
else:
msg = 'Failed to make details query: %r'%self.url
self.log.exception(msg)
return
oraw = raw
if 'amazon.com.br' in self.url:
raw = raw.decode('utf-8') # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
if '<title>404 - ' in raw:
self.log.error('URL malformed: %r'%self.url)
return
try:
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
namespaceHTMLElements=False)
except:
msg = 'Failed to parse amazon details page: %r'%self.url
self.log.exception(msg)
return
if self.domain == 'jp':
for a in root.xpath('//a[@href]'):
if 'black-curtain-redirect.html' in a.get('href'):
self.url = 'http://amazon.co.jp'+a.get('href')
self.log('Black curtain redirect found, following')
return self.get_details()
errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
msg = 'Failed to parse amazon details page: %r'%self.url
msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
self.log.error(msg)
return
from css_selectors import Select from css_selectors import Select
self.selector = Select(root) self.selector = Select(root)
self.parse_details(oraw, root) self.parse_details(raw, root)
def parse_details(self, raw, root): def parse_details(self, raw, root):
try: asin = parse_asin(root, self.log, self.url)
asin = self.parse_asin(root)
except:
self.log.exception('Error parsing asin for url: %r'%self.url)
asin = None
if self.testing: if self.testing:
import tempfile, uuid import tempfile, uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_', with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
@ -386,11 +402,6 @@ class Worker(Thread): # Get details {{{
self.result_queue.put(mi) self.result_queue.put(mi)
def parse_asin(self, root):
link = root.xpath('//link[@rel="canonical" and @href]')
for l in link:
return l.get('href').rpartition('/')[-1]
def totext(self, elem): def totext(self, elem):
return self.tostring(elem, encoding=unicode, method='text').strip() return self.tostring(elem, encoding=unicode, method='text').strip()
@ -934,13 +945,28 @@ class Amazon(Source):
import html5lib import html5lib
testing = getattr(self, 'running_a_test', False) testing = getattr(self, 'running_a_test', False)
br = self.browser
domain, asin = self.get_domain_and_asin(identifiers)
if asin and domain == 'com':
# Try to directly get details page instead of running a search
durl = 'http://www.amazon.com/gp/product/' + asin
preparsed_root = parse_details_page(durl, log, timeout, br, domain)
if preparsed_root is not None:
qasin = parse_asin(preparsed_root[1], log, durl)
if qasin == asin:
w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
try:
w.get_details()
return
except Exception:
log.exception('get_details failed for url: %r'%durl)
query, domain = self.create_query(log, title=title, authors=authors, query, domain = self.create_query(log, title=title, authors=authors,
identifiers=identifiers) identifiers=identifiers)
if query is None: if query is None:
log.error('Insufficient metadata to construct query') log.error('Insufficient metadata to construct query')
return return
br = self.browser
if testing: if testing:
print ('Using user agent for amazon: %s'%self.user_agent) print ('Using user agent for amazon: %s'%self.user_agent)
try: try:
@ -1069,6 +1095,11 @@ if __name__ == '__main__': # tests {{{
isbn_test, title_test, authors_test, comments_test) isbn_test, title_test, authors_test, comments_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # A kindle edition that does not appear in the search results when searching by ASIN
{'identifiers':{'amazon':'B004JHY6OG'}},
[title_test('The Heroes: A First Law Novel', exact=True)]
),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'title':'C++ Concurrency in Action'}, {'title':'C++ Concurrency in Action'},
[title_test('C++ Concurrency in Action: Practical Multithreading', [title_test('C++ Concurrency in Action: Practical Multithreading',