Amazon metadata download: Fix for change in Amazon website that broke downloading metadata. Also make debugging the plugin a little easier. Fixes #878395 (Amazon ASIN no longer searchable in the ids field)

This commit is contained in:
Kovid Goyal 2011-10-20 08:10:16 +05:30
parent eda4c65740
commit 594bf0b678
3 changed files with 23 additions and 20 deletions

View File

@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{
Get book details from amazons book page in a separate thread
'''
def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20):
def __init__(self, url, result_queue, browser, log, relevance, domain,
plugin, timeout=20, testing=False):
Thread.__init__(self)
self.daemon = True
self.testing = testing
self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout
self.relevance, self.plugin = relevance, plugin
@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{
self.log.exception(msg)
return
oraw = raw
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
#open('/t/t.html', 'wb').write(raw)
if '<title>404 - ' in raw:
self.log.error('URL malformed: %r'%self.url)
return
@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{
self.log.error(msg)
return
self.parse_details(root)
self.parse_details(oraw, root)
def parse_details(self, root):
def parse_details(self, raw, root):
try:
asin = self.parse_asin(root)
except:
self.log.exception('Error parsing asin for url: %r'%self.url)
asin = None
if self.testing:
import tempfile
with tempfile.NamedTemporaryFile(prefix=asin + '_',
suffix='.html', delete=False) as f:
f.write(raw)
print ('Downloaded html for', asin, 'saved in', f.name)
try:
title = self.parse_title(root)
@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{
return l.get('href').rpartition('/')[-1]
def parse_title(self, root):
tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
if actual_title:
title = tostring(actual_title[0], encoding=unicode,
@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{
return re.sub(r'[(\[].*[)\]]', '', title).strip()
def parse_authors(self, root):
x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
aname = root.xpath(x)
if not aname:
aname = root.xpath('''
//h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
//h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
''')
for x in aname:
x.tail = ''
@ -666,7 +673,8 @@ class Amazon(Source):
log.error('No matches found with query: %r'%query)
return
workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in
workers = [Worker(url, result_queue, br, log, i, domain, self,
testing=getattr(self, 'running_a_test', False)) for i, url in
enumerate(matches)]
for w in workers:
@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{
),
( # An e-book ISBN not on Amazon, the title/author search matches
# the Kindle edition, which has different markup for ratings and
# isbn
{'identifiers':{'isbn': '9780307459671'},
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
),
( # This isbn not on amazon
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
'authors':['Lutz']},

View File

@ -196,6 +196,7 @@ class Source(Plugin):
def __init__(self, *args, **kwargs):
Plugin.__init__(self, *args, **kwargs)
self.running_a_test = False # Set to True when using identify_test()
self._isbn_to_identifier_cache = {}
self._identifier_to_cover_url_cache = {}
self.cache_lock = threading.RLock()

View File

@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{
rq = Queue()
args = (log, rq, abort)
start_time = time.time()
plugin.running_a_test = True
try:
err = plugin.identify(*args, **kwargs)
finally:
plugin.running_a_test = False
total_time = time.time() - start_time
times.append(total_time)
if err is not None: