diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 82c2519e29..bc88d473fd 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string): USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13' USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' -def random_user_agent(): +def random_user_agent(choose=None): choices = [ 'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', - 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19', - 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11', + 'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0', + 'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)', ] - #return choices[-1] - return choices[random.randint(0, len(choices)-1)] + if choose is None: + choose = random.randint(0, len(choices)-1) + return choices[choose] def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None): ''' diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 3d08b96c5f..cb724765f5 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -13,7 +13,7 @@ from threading import Thread from Queue import Queue, Empty -from calibre import as_unicode +from calibre import as_unicode, random_user_agent from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase, fixauthors) @@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{ def get_details(self): from calibre.utils.cleantext import clean_ascii_chars - from calibre.utils.soupparser import fromstring from calibre.ebooks.chardet import xml_to_unicode + import html5lib try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() @@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{ return try: - root = fromstring(clean_ascii_chars(raw)) + root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', + namespaceHTMLElements=False) except: msg = 'Failed to parse amazon details page: %r'%self.url self.log.exception(msg) @@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{ if m is not None: return float(m.group(1))/float(m.group(3)) * 5 - def parse_comments(self, root): + def _render_comments(self, desc): from calibre.library.comments import sanitize_comments_html + for c in desc.xpath('descendant::noscript'): + c.getparent().remove(c) + for c in desc.xpath('descendant::*[@class="seeAll" or' + ' @class="emptyClear" or @id="collapsePS" or' + ' @id="expandPS"]'): + c.getparent().remove(c) + + for a in desc.xpath('descendant::a[@href]'): + del a.attrib['href'] + a.tag = 'span' + desc = self.tostring(desc, method='html', encoding=unicode).strip() + + # Encoding bug in Amazon data U+fffd (replacement char) + # in some examples it is present in place of ' + desc = desc.replace('\ufffd', "'") + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) + # Remove the notice about text referring to out of print editions + desc = re.sub(r'(?s)--This text ref.*?', '', desc) + # Remove comments + desc = re.sub(r'(?s)', '', desc) + return sanitize_comments_html(desc) + + + def parse_comments(self, root): + ans = '' + desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') + if desc: + ans = self._render_comments(desc[0]) + desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') if desc: - desc = desc[0] - for c in desc.xpath('descendant::*[@class="seeAll" or' - ' @class="emptyClear"]'): - c.getparent().remove(c) - for a in desc.xpath('descendant::a[@href]'): - del a.attrib['href'] - a.tag = 'span' - desc = self.tostring(desc, method='html', encoding=unicode).strip() - - # Encoding bug in Amazon data U+fffd (replacement char) - # in some examples it is present in place of ' - desc = desc.replace('\ufffd', "'") - # remove all attributes from tags - desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) - # Collapse whitespace - #desc = re.sub('\n+', '\n', desc) - #desc = re.sub(' +', ' ', desc) - # Remove the notice about text referring to out of print editions - desc = re.sub(r'(?s)--This text ref.*?', '', desc) - # Remove comments - desc = re.sub(r'(?s)', '', desc) - return sanitize_comments_html(desc) + ans += self._render_comments(desc[0]) + return ans def parse_cover(self, root): imgs = root.xpath('//img[@id="prodImage" and @src]') @@ -467,6 +481,28 @@ class Amazon(Source): Source.__init__(self, *args, **kwargs) self.set_amazon_id_touched_fields() + def test_fields(self, mi): + ''' + Return the first field from self.touched_fields that is null on the + mi object + ''' + for key in self.touched_fields: + if key.startswith('identifier:'): + key = key.partition(':')[-1] + if key == 'amazon': + if self.domain != 'com': + key += '_' + self.domain + if not mi.has_identifier(key): + return 'identifier: ' + key + elif mi.is_null(key): + return key + + @property + def user_agent(self): + # Pass in an index to random_user_agent() to test with a particular + # user agent + return random_user_agent() + def save_settings(self, *args, **kwargs): Source.save_settings(self, *args, **kwargs) self.set_amazon_id_touched_fields() @@ -507,6 +543,9 @@ class Amazon(Source): @property def domain(self): + x = getattr(self, 'testing_domain', None) + if x is not None: + return x domain = self.prefs['domain'] if domain not in self.AMAZON_DOMAINS: domain = 'com' @@ -599,16 +638,52 @@ class Amazon(Source): return url # }}} + def parse_results_page(self, root): # {{{ + from lxml.html import tostring + + matches = [] + + def title_ok(title): + title = title.lower() + for x in ('bulk pack', '[audiobook]', '[audio cd]'): + if x in title: + return False + return True + + for div in root.xpath(r'//div[starts-with(@id, "result_")]'): + for a in div.xpath(r'descendant::a[@class="title" and @href]'): + title = tostring(a, method='text', encoding=unicode) + if title_ok(title): + matches.append(a.get('href')) + break + + if not matches: + # This can happen for some user agents that Amazon thinks are + # mobile/less capable + for td in root.xpath( + r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): + for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): + title = tostring(a, method='text', encoding=unicode) + if title_ok(title): + matches.append(a.get('href')) + break + + + # Keep only the top 5 matches as the matches are sorted by relevance by + # Amazon so lower matches are not likely to be very relevant + return matches[:5] + # }}} + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. ''' - from lxml.html import tostring from calibre.utils.cleantext import clean_ascii_chars - from calibre.utils.soupparser import fromstring from calibre.ebooks.chardet import xml_to_unicode + from lxml.html import tostring + import html5lib query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) @@ -616,6 +691,8 @@ class Amazon(Source): log.error('Insufficient metadata to construct query') return br = self.browser + if getattr(self, 'running_a_test', False): + print ('Using user agent for amazon: %s'%self.user_agent) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: @@ -634,15 +711,23 @@ class Amazon(Source): return as_unicode(msg) - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] + raw = clean_ascii_chars(xml_to_unicode(raw, + strip_encoding_pats=True, resolve_entities=True)[0]) + + if getattr(self, 'running_a_test', False): + import tempfile + with tempfile.NamedTemporaryFile(prefix='amazon_results_', + suffix='.html', delete=False) as f: + f.write(raw.encode('utf-8')) + print ('Downloaded html for results page saved in', f.name) matches = [] found = '404 - ' not in raw if found: try: - root = fromstring(clean_ascii_chars(raw)) + root = html5lib.parse(raw, treebuilder='lxml', + namespaceHTMLElements=False) except: msg = 'Failed to parse amazon page for query: %r'%query log.exception(msg) @@ -655,30 +740,9 @@ class Amazon(Source): # The error is almost always a not found error found = False + if found: - for div in root.xpath(r'//div[starts-with(@id, "result_")]'): - for a in div.xpath(r'descendant::a[@class="title" and @href]'): - title = tostring(a, method='text', encoding=unicode).lower() - if 'bulk pack' not in title: - matches.append(a.get('href')) - break - if not matches: - # This can happen for some user agents that Amazon thinks are - # mobile/less capable - log('Trying alternate results page markup') - for td in root.xpath( - r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): - for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): - title = tostring(a, method='text', encoding=unicode).lower() - if ('bulk pack' not in title and '[audiobook]' not in - title and '[audio cd]' not in title): - matches.append(a.get('href')) - break - - - # Keep only the top 5 matches as the matches are sorted by relevance by - # Amazon so lower matches are not likely to be very relevant - matches = matches[:5] + matches = self.parse_results_page(root) if abort.is_set(): return @@ -686,7 +750,7 @@ class Amazon(Source): if not matches: if identifiers and title and authors: log('No matches found with identifiers, retrying using only' - ' title and authors') + ' title and authors. Query: %r'%query) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) log.error('No matches found with query: %r'%query) @@ -756,9 +820,18 @@ if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e # src/calibre/ebooks/metadata/sources/amazon.py from calibre.ebooks.metadata.sources.test import (test_identify_plugin, - isbn_test, title_test, authors_test) + isbn_test, title_test, authors_test, comments_test) com_tests = [ # {{{ + ( # Different comments markup, using Book Description section + {'identifiers':{'amazon':'0982514506'}}, + [title_test( + "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy" + , exact=True), + comments_test('Jelena'), comments_test('Leslie'), + ] + ), + ( # # in title {'title':'Expert C# 2008 Business Objects', 'authors':['Lhotka']}, @@ -850,7 +923,17 @@ if __name__ == '__main__': # tests {{{ ), ] # }}} - test_identify_plugin(Amazon.name, com_tests) - #test_identify_plugin(Amazon.name, de_tests) + def do_test(domain, start=0, stop=None): + tests = globals().get(domain+'_tests') + if stop is None: + stop = len(tests) + tests = tests[start:stop] + test_identify_plugin(Amazon.name, tests, modify_plugin=lambda + p:setattr(p, 'testing_domain', domain)) + + do_test('com') + + #do_test('de') + # }}} diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 4c334f4e46..4408bff6c6 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -253,10 +253,16 @@ class Source(Plugin): # Browser {{{ + @property + def user_agent(self): + # Pass in an index to random_user_agent() to test with a particular + # user agent + return random_user_agent() + @property def browser(self): if self._browser is None: - self._browser = browser(user_agent=random_user_agent()) + self._browser = browser(user_agent=self.user_agent) if self.supports_gzip_transfer_encoding: self._browser.set_handle_gzip(True) return self._browser.clone_browser() diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index bccce3dba2..4853035b27 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -84,6 +84,16 @@ def series_test(series, series_index): return test +def comments_test(sentinel): + + def test(mi): + comm = mi.comments.lower() if mi.comments else '' + if sentinel and sentinel.lower() in comm: + return True + prints('comments test failed. %s not in comments'%sentinel) + return False + return test + def init_test(tdir_name): tdir = tempfile.gettempdir() lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt') @@ -157,7 +167,7 @@ def test_identify(tests): # {{{ # }}} -def test_identify_plugin(name, tests): # {{{ +def test_identify_plugin(name, tests, modify_plugin=lambda plugin:None): # {{{ ''' :param name: Plugin name :param tests: List of 2-tuples. Each two tuple is of the form (args, @@ -171,6 +181,7 @@ def test_identify_plugin(name, tests): # {{{ if x.name == name: plugin = x break + modify_plugin(plugin) prints('Testing the identify function of', plugin.name) prints('Using extra headers:', plugin.browser.addheaders) diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 5d6c43f343..0f45a47032 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -136,6 +136,7 @@ def sanitize_comments_html(html): text = html2text(html) md = markdown.Markdown(safe_mode=True) cleansed = re.sub('\n+', '', md.convert(text)) + cleansed = cleansed.replace(markdown.HTML_REMOVED_TEXT, '') return cleansed def test():