mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More amazon cover finding fixes
This commit is contained in:
parent
ca03e066a9
commit
03066f3283
@ -541,16 +541,34 @@ class Worker(Thread): # Get details {{{
|
|||||||
def parse_cover(self, root, raw=b""):
|
def parse_cover(self, root, raw=b""):
|
||||||
# Look for the image URL in javascript, using the first image in the
|
# Look for the image URL in javascript, using the first image in the
|
||||||
# image gallery as the cover
|
# image gallery as the cover
|
||||||
import json
|
|
||||||
imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
|
imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
|
||||||
for script in root.xpath('//script'):
|
for script in root.xpath('//script'):
|
||||||
m = imgpat.search(script.text or '')
|
m = imgpat.search(script.text or '')
|
||||||
if m is not None:
|
if m is not None:
|
||||||
|
import json
|
||||||
try:
|
try:
|
||||||
return json.loads(m.group(1))[0]['mainUrl']
|
return json.loads(m.group(1))[0]['mainUrl']
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
def clean_img_src(src):
|
||||||
|
parts = src.split('/')
|
||||||
|
if len(parts) > 3:
|
||||||
|
bn = parts[-1]
|
||||||
|
sparts = bn.split('_')
|
||||||
|
if len(sparts) > 2:
|
||||||
|
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
|
||||||
|
return ('/'.join(parts[:-1]))+'/'+bn
|
||||||
|
|
||||||
|
imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
|
||||||
|
for script in root.xpath('//script'):
|
||||||
|
m = imgpat2.search(script.text or '')
|
||||||
|
if m is not None:
|
||||||
|
src = m.group(1)
|
||||||
|
url = clean_img_src(src)
|
||||||
|
if url:
|
||||||
|
return url
|
||||||
|
|
||||||
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
|
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
|
||||||
if not imgs:
|
if not imgs:
|
||||||
imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]')
|
imgs = root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]')
|
||||||
@ -558,22 +576,17 @@ class Worker(Thread): # Get details {{{
|
|||||||
imgs = root.xpath('//div[@id="main-image-container"]//img[@src]')
|
imgs = root.xpath('//div[@id="main-image-container"]//img[@src]')
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
src = img.get('src')
|
src = img.get('src')
|
||||||
if src.startswith('data:'):
|
if 'data:' in src:
|
||||||
continue
|
continue
|
||||||
if 'loading-' in src:
|
if 'loading-' in src:
|
||||||
js_img = re.search(br'"largeImage":"(http://[^"]+)",',raw)
|
js_img = re.search(br'"largeImage":"(http://[^"]+)",',raw)
|
||||||
if js_img:
|
if js_img:
|
||||||
src = js_img.group(1).decode('utf-8')
|
src = js_img.group(1).decode('utf-8')
|
||||||
continue
|
|
||||||
if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
|
if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
|
||||||
self.log('Found image: %s' % src)
|
self.log('Found image: %s' % src)
|
||||||
parts = src.split('/')
|
url = clean_img_src(src)
|
||||||
if len(parts) > 3:
|
if url:
|
||||||
bn = parts[-1]
|
return url
|
||||||
sparts = bn.split('_')
|
|
||||||
if len(sparts) > 2:
|
|
||||||
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
|
|
||||||
return ('/'.join(parts[:-1]))+'/'+bn
|
|
||||||
|
|
||||||
def parse_new_details(self, root, mi, non_hero):
|
def parse_new_details(self, root, mi, non_hero):
|
||||||
table = non_hero.xpath('descendant::table')[0]
|
table = non_hero.xpath('descendant::table')[0]
|
||||||
@ -652,7 +665,7 @@ class Amazon(Source):
|
|||||||
|
|
||||||
capabilities = frozenset(['identify', 'cover'])
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
||||||
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
|
'rating', 'comments', 'publisher', 'pubdate',
|
||||||
'languages', 'series'])
|
'languages', 'series'])
|
||||||
has_html_comments = True
|
has_html_comments = True
|
||||||
supports_gzip_transfer_encoding = True
|
supports_gzip_transfer_encoding = True
|
||||||
@ -1110,8 +1123,8 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
|
|
||||||
( # A newer book
|
( # A newer book
|
||||||
{'identifiers':{'isbn': '9780316044981'}},
|
{'identifiers':{'amazon': 'B004JHY6OG'}},
|
||||||
[title_test('The Heroes', exact=True),
|
[title_test('The Heroes', exact=False),
|
||||||
authors_test(['Joe Abercrombie'])]
|
authors_test(['Joe Abercrombie'])]
|
||||||
|
|
||||||
),
|
),
|
||||||
@ -1200,5 +1213,3 @@ if __name__ == '__main__': # tests {{{
|
|||||||
# do_test('de')
|
# do_test('de')
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user