diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 82c2519e29..bc88d473fd 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string):
 USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
 USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
 
-def random_user_agent():
+def random_user_agent(choose=None):
     choices = [
         'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
-        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
         'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5',
         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
     ]
-    #return choices[-1]
-    return choices[random.randint(0, len(choices)-1)]
+    if choose is None:
+        choose = random.randint(0, len(choices)-1)
+    return choices[choose]
 
 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
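
The new `choose` parameter keeps the default random behaviour but lets callers pin a specific user agent, which is what the testing hooks in the amazon.py changes below rely on. A minimal usage sketch (the asserted prefix simply matches the first entry of `choices` above):

    from calibre import random_user_agent

    # Default: pick one of the choices at random, as before
    ua = random_user_agent()

    # Deterministic, e.g. to reproduce a scrape failure with one user agent
    ua0 = random_user_agent(choose=0)
    assert ua0.startswith('Mozilla/5.0 (Windows NT 5.2')
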
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 3d08b96c5f..cb724765f5 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -13,7 +13,7 @@ from threading import Thread
 from Queue import Queue, Empty
 
-from calibre import as_unicode
+from calibre import as_unicode, random_user_agent
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
         fixauthors)
@@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{
 
     def get_details(self):
         from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.utils.soupparser import fromstring
         from calibre.ebooks.chardet import xml_to_unicode
+        import html5lib
 
         try:
             raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
@@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{
             return
 
         try:
-            root = fromstring(clean_ascii_chars(raw))
+            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
+                    namespaceHTMLElements=False)
         except:
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
@@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{
         if m is not None:
             return float(m.group(1))/float(m.group(3)) * 5
 
-    def parse_comments(self, root):
+    def _render_comments(self, desc):
         from calibre.library.comments import sanitize_comments_html
+
+        for c in desc.xpath('descendant::noscript'):
+            c.getparent().remove(c)
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
+                ' @class="emptyClear" or @id="collapsePS" or'
+                ' @id="expandPS"]'):
+            c.getparent().remove(c)
+
+        for a in desc.xpath('descendant::a[@href]'):
+            del a.attrib['href']
+            a.tag = 'span'
+        desc = self.tostring(desc, method='html', encoding=unicode).strip()
+
+        # Encoding bug in Amazon data U+fffd (replacement char)
+        # in some examples it is present in place of '
+        desc = desc.replace('\ufffd', "'")
+        # remove all attributes from tags
+        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+        # Collapse whitespace
+        #desc = re.sub('\n+', '\n', desc)
+        #desc = re.sub(' +', ' ', desc)
+        # Remove the notice about text referring to out of print editions
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+        # Remove comments
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+        return sanitize_comments_html(desc)
+
+
+    def parse_comments(self, root):
+        ans = ''
+        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
+        if desc:
+            ans = self._render_comments(desc[0])
+
         desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
         if desc:
-            desc = desc[0]
-            for c in desc.xpath('descendant::*[@class="seeAll" or'
-                ' @class="emptyClear"]'):
-                c.getparent().remove(c)
-            for a in desc.xpath('descendant::a[@href]'):
-                del a.attrib['href']
-                a.tag = 'span'
-            desc = self.tostring(desc, method='html', encoding=unicode).strip()
-
-            # Encoding bug in Amazon data U+fffd (replacement char)
-            # in some examples it is present in place of '
-            desc = desc.replace('\ufffd', "'")
-            # remove all attributes from tags
-            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
-            # Collapse whitespace
-            #desc = re.sub('\n+', '\n', desc)
-            #desc = re.sub(' +', ' ', desc)
-            # Remove the notice about text referring to out of print editions
-            desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
-            # Remove comments
-            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
-            return sanitize_comments_html(desc)
+            ans += self._render_comments(desc[0])
+        return ans
 
     def parse_cover(self, root):
         imgs = root.xpath('//img[@id="prodImage" and @src]')
@@ -467,6 +481,28 @@ class Amazon(Source):
         Source.__init__(self, *args, **kwargs)
         self.set_amazon_id_touched_fields()
 
+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is null on the
+        mi object
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1]
+                if key == 'amazon':
+                    if self.domain != 'com':
+                        key += '_' + self.domain
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
+    @property
+    def user_agent(self):
+        # Pass in an index to random_user_agent() to test with a particular
+        # user agent
+        return random_user_agent()
+
     def save_settings(self, *args, **kwargs):
         Source.save_settings(self, *args, **kwargs)
         self.set_amazon_id_touched_fields()
@@ -507,6 +543,9 @@ class Amazon(Source):
 
     @property
     def domain(self):
+        x = getattr(self, 'testing_domain', None)
+        if x is not None:
+            return x
         domain = self.prefs['domain']
         if domain not in self.AMAZON_DOMAINS:
             domain = 'com'
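
Swapping `soupparser.fromstring` for html5lib with the lxml treebuilder keeps every existing `root.xpath(...)` call working, since the result is still an lxml tree; `namespaceHTMLElements=False` avoids `{http://www.w3.org/1999/xhtml}`-prefixed tag names that would break those expressions. A rough sketch of the behaviour, with made-up markup of the malformed kind Amazon serves:

    import html5lib

    raw = '<div id="productDescription"><p>Unclosed <b>description'
    root = html5lib.parse(raw, treebuilder='lxml',
            namespaceHTMLElements=False)
    # html5lib repairs the tag soup; the resulting lxml tree is
    # xpath-queryable with plain, un-namespaced tag names
    print(root.xpath('//div[@id="productDescription"]'))
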
r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): + for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): + title = tostring(a, method='text', encoding=unicode) + if title_ok(title): + matches.append(a.get('href')) + break + + + # Keep only the top 5 matches as the matches are sorted by relevance by + # Amazon so lower matches are not likely to be very relevant + return matches[:5] + # }}} + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. ''' - from lxml.html import tostring from calibre.utils.cleantext import clean_ascii_chars - from calibre.utils.soupparser import fromstring from calibre.ebooks.chardet import xml_to_unicode + from lxml.html import tostring + import html5lib query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) @@ -616,6 +691,8 @@ class Amazon(Source): log.error('Insufficient metadata to construct query') return br = self.browser + if getattr(self, 'running_a_test', False): + print ('Using user agent for amazon: %s'%self.user_agent) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: @@ -634,15 +711,23 @@ class Amazon(Source): return as_unicode(msg) - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] + raw = clean_ascii_chars(xml_to_unicode(raw, + strip_encoding_pats=True, resolve_entities=True)[0]) + + if getattr(self, 'running_a_test', False): + import tempfile + with tempfile.NamedTemporaryFile(prefix='amazon_results_', + suffix='.html', delete=False) as f: + f.write(raw.encode('utf-8')) + print ('Downloaded html for results page saved in', f.name) matches = [] found = '