Amazon metadata download: Support for yet another variant of Amazon cover image markup

This commit is contained in:
Kovid Goyal 2015-08-26 21:08:32 +05:30
parent b91f42ac8d
commit a623357b0c

View File

@ -557,11 +557,11 @@ class Worker(Thread): # Get details {{{
def parse_cover(self, root, raw=b""): def parse_cover(self, root, raw=b""):
# Look for the image URL in javascript, using the first image in the # Look for the image URL in javascript, using the first image in the
# image gallery as the cover # image gallery as the cover
import json
imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""") imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
for script in root.xpath('//script'): for script in root.xpath('//script'):
m = imgpat.search(script.text or '') m = imgpat.search(script.text or '')
if m is not None: if m is not None:
import json
try: try:
return json.loads(m.group(1))[0]['mainUrl'] return json.loads(m.group(1))[0]['mainUrl']
except Exception: except Exception:
@ -590,6 +590,25 @@ class Worker(Thread): # Get details {{{
imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]')
if not imgs: if not imgs:
imgs = root.xpath('//div[@id="main-image-container"]//img[@src]') imgs = root.xpath('//div[@id="main-image-container"]//img[@src]')
if not imgs:
imgs = root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
for img in imgs:
try:
idata = json.loads(img.get('data-a-dynamic-image'))
except Exception:
imgs = ()
else:
mwidth = 0
try:
url = None
for iurl, (width, height) in idata.iteritems():
if width > mwidth:
mwidth = width
url = iurl
return url
except Exception:
pass
for img in imgs: for img in imgs:
src = img.get('src') src = img.get('src')
if 'data:' in src: if 'data:' in src:
@ -972,7 +991,6 @@ class Amazon(Source):
if udata is not None: if udata is not None:
# Try to directly get details page instead of running a search # Try to directly get details page instead of running a search
domain, idtype, asin, durl = udata domain, idtype, asin, durl = udata
durl = 'http://www.amazon.com/gp/product/' + asin
preparsed_root = parse_details_page(durl, log, timeout, br, domain) preparsed_root = parse_details_page(durl, log, timeout, br, domain)
if preparsed_root is not None: if preparsed_root is not None:
qasin = parse_asin(preparsed_root[1], log, durl) qasin = parse_asin(preparsed_root[1], log, durl)
@ -1123,11 +1141,9 @@ if __name__ == '__main__': # tests {{{
), ),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'title':'C++ Concurrency in Action'}, {'identifiers':{'amazon':'1933988770'}},
[title_test('C++ Concurrency in Action: Practical Multithreading', [title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)]
exact=True), ),
]
),
( # noscript description ( # noscript description