Amazon metadata download: Fixes for various changes to the Amazon website markup

This commit is contained in:
Kovid Goyal 2024-01-03 20:52:01 +05:30
parent dd33bd079a
commit 4526abb5e3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -497,6 +497,7 @@ class Worker(Thread): # Get details {{{
non_hero = tuple(self.selector( non_hero = tuple(self.selector(
'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector( 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
'#productDetails_techSpec_sections')) '#productDetails_techSpec_sections'))
feature_and_detail_bullets = root.xpath('//*[@data-feature-name="featureBulletsAndDetailBullets"]')
if detail_bullets: if detail_bullets:
self.parse_detail_bullets(root, mi, detail_bullets[0]) self.parse_detail_bullets(root, mi, detail_bullets[0])
elif non_hero: elif non_hero:
@ -505,6 +506,8 @@ class Worker(Thread): # Get details {{{
except: except:
self.log.exception( self.log.exception(
'Failed to parse new-style book details section') 'Failed to parse new-style book details section')
elif feature_and_detail_bullets:
self.parse_detail_bullets(root, mi, feature_and_detail_bullets[0], ul_selector='ul')
else: else:
pd = root.xpath(self.pd_xpath) pd = root.xpath(self.pd_xpath)
@ -674,6 +677,16 @@ class Worker(Thread): # Get details {{{
ans = parse_ratings_text(t) ans = parse_ratings_text(t)
if ans is not None: if ans is not None:
return ans return ans
else:
# found in kindle book pages on amazon.com
for x in root.xpath('//a[@id="acrCustomerReviewLink"]'):
spans = x.xpath('./span')
if spans:
txt = self.tostring(spans[0], method='text', encoding='unicode', with_tail=False).strip()
try:
return float(txt.replace(',', '.'))
except Exception:
pass
def _render_comments(self, desc): def _render_comments(self, desc):
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
@ -784,6 +797,20 @@ class Worker(Thread): # Get details {{{
def parse_series(self, root): def parse_series(self, root):
ans = (None, None) ans = (None, None)
# This is found on kindle pages for books on amazon.com
series = root.xpath('//*[@id="rpi-attribute-book_details-series"]')
if series:
spans = series[0].xpath('descendant::span')
if spans:
texts = [self.tostring(x, encoding='unicode', method='text', with_tail=False).strip() for x in spans]
texts = list(filter(None, texts))
if len(texts) == 2:
idxinfo, series = texts
m = re.search(r'[0-9.]+', idxinfo.strip())
if m is not None:
ans = series, float(m.group())
return ans
# This is found on the paperback/hardback pages for books on amazon.com # This is found on the paperback/hardback pages for books on amazon.com
series = root.xpath('//div[@data-feature-name="seriesTitle"]') series = root.xpath('//div[@data-feature-name="seriesTitle"]')
if series: if series:
@ -952,8 +979,11 @@ class Worker(Thread): # Get details {{{
if url: if url:
return url return url
def parse_detail_bullets(self, root, mi, container): def parse_detail_bullets(self, root, mi, container, ul_selector='.detail-bullet-list'):
ul = next(self.selector('.detail-bullet-list', root=container)) try:
ul = next(self.selector(ul_selector, root=container))
except StopIteration:
return
for span in self.selector('.a-list-item', root=ul): for span in self.selector('.a-list-item', root=ul):
cells = span.xpath('./span') cells = span.xpath('./span')
if len(cells) >= 2: if len(cells) >= 2:
@ -1052,7 +1082,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 3, 6) version = (1, 3, 7)
minimum_calibre_version = (2, 82, 0) minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')
@ -1732,8 +1762,7 @@ def manual_tests(domain, **kw): # {{{
all_tests['com'] = [ # {{{ all_tests['com'] = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers': {'amazon': '1423146786'}}, {'identifiers': {'amazon': '1423146786'}},
[title_test('The Blood of Olympus', [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
exact=True), series_test('The Heroes of Olympus', 5)]
), ),
( # Kindle edition with series ( # Kindle edition with series
@ -1752,7 +1781,7 @@ def manual_tests(domain, **kw): # {{{
( # Different comments markup, using Book Description section ( # Different comments markup, using Book Description section
{'identifiers': {'amazon': '0982514506'}}, {'identifiers': {'amazon': '0982514506'}},
[title_test( [title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy", "Griffin's Destiny",
exact=True), exact=True),
comments_test('Jelena'), comments_test('Ashinji'), comments_test('Jelena'), comments_test('Ashinji'),
] ]
@ -1768,8 +1797,8 @@ def manual_tests(domain, **kw): # {{{
( # No specific problems ( # No specific problems
{'identifiers': {'isbn': '0743273567'}}, {'identifiers': {'isbn': '0743273567'}},
[title_test('the great gatsby: the only authorized edition', exact=True), [title_test('the great gatsby'),
authors_test(['Francis Scott Fitzgerald'])] authors_test(['f. Scott Fitzgerald'])]
), ),
] ]