More amazon cover finding fixes

This commit is contained in:
Kovid Goyal 2014-11-10 10:29:10 +05:30
parent ca03e066a9
commit 03066f3283

View File

@ -541,16 +541,34 @@ class Worker(Thread): # Get details {{{
def parse_cover(self, root, raw=b""): def parse_cover(self, root, raw=b""):
# Look for the image URL in javascript, using the first image in the # Look for the image URL in javascript, using the first image in the
# image gallery as the cover # image gallery as the cover
import json
imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""") imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
for script in root.xpath('//script'): for script in root.xpath('//script'):
m = imgpat.search(script.text or '') m = imgpat.search(script.text or '')
if m is not None: if m is not None:
import json
try: try:
return json.loads(m.group(1))[0]['mainUrl'] return json.loads(m.group(1))[0]['mainUrl']
except Exception: except Exception:
continue continue
def clean_img_src(src):
parts = src.split('/')
if len(parts) > 3:
bn = parts[-1]
sparts = bn.split('_')
if len(sparts) > 2:
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
return ('/'.join(parts[:-1]))+'/'+bn
imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
for script in root.xpath('//script'):
m = imgpat2.search(script.text or '')
if m is not None:
src = m.group(1)
url = clean_img_src(src)
if url:
return url
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]') imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
if not imgs: if not imgs:
imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]')
@ -558,22 +576,17 @@ class Worker(Thread): # Get details {{{
imgs = root.xpath('//div[@id="main-image-container"]//img[@src]') imgs = root.xpath('//div[@id="main-image-container"]//img[@src]')
for img in imgs: for img in imgs:
src = img.get('src') src = img.get('src')
if src.startswith('data:'): if 'data:' in src:
continue continue
if 'loading-' in src: if 'loading-' in src:
js_img = re.search(br'"largeImage":"(http://[^"]+)",',raw) js_img = re.search(br'"largeImage":"(http://[^"]+)",',raw)
if js_img: if js_img:
src = js_img.group(1).decode('utf-8') src = js_img.group(1).decode('utf-8')
continue
if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src): if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
self.log('Found image: %s' % src) self.log('Found image: %s' % src)
parts = src.split('/') url = clean_img_src(src)
if len(parts) > 3: if url:
bn = parts[-1] return url
sparts = bn.split('_')
if len(sparts) > 2:
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
return ('/'.join(parts[:-1]))+'/'+bn
def parse_new_details(self, root, mi, non_hero): def parse_new_details(self, root, mi, non_hero):
table = non_hero.xpath('descendant::table')[0] table = non_hero.xpath('descendant::table')[0]
@ -652,7 +665,7 @@ class Amazon(Source):
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:amazon', touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'rating', 'comments', 'publisher', 'pubdate',
'languages', 'series']) 'languages', 'series'])
has_html_comments = True has_html_comments = True
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
@ -1110,8 +1123,8 @@ if __name__ == '__main__': # tests {{{
), ),
( # A newer book ( # A newer book
{'identifiers':{'isbn': '9780316044981'}}, {'identifiers':{'amazon': 'B004JHY6OG'}},
[title_test('The Heroes', exact=True), [title_test('The Heroes', exact=False),
authors_test(['Joe Abercrombie'])] authors_test(['Joe Abercrombie'])]
), ),
@ -1200,5 +1213,3 @@ if __name__ == '__main__': # tests {{{
# do_test('de') # do_test('de')
# }}} # }}}