Amazon metadata download: Fixes for various changes to Amazon website markup

2025-07-31 14:33:54 -04:00 · 2024-01-03 20:52:01 +05:30 · 2024-01-03 20:52:01 +05:30 · 4526abb5e3
commit 4526abb5e3
parent dd33bd079a
1 changed files with 37 additions and 8 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -497,6 +497,7 @@ class Worker(Thread):  # Get details {{{
        non_hero = tuple(self.selector(
            'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
                '#productDetails_techSpec_sections'))
+        feature_and_detail_bullets = root.xpath('//*[@data-feature-name="featureBulletsAndDetailBullets"]')
        if detail_bullets:
            self.parse_detail_bullets(root, mi, detail_bullets[0])
        elif non_hero:
@@ -505,6 +506,8 @@ class Worker(Thread):  # Get details {{{
            except:
                self.log.exception(
                    'Failed to parse new-style book details section')
+        elif feature_and_detail_bullets:
+            self.parse_detail_bullets(root, mi, feature_and_detail_bullets[0], ul_selector='ul')
        else:
            pd = root.xpath(self.pd_xpath)
@@ -674,6 +677,16 @@ class Worker(Thread):  # Get details {{{
                ans = parse_ratings_text(t)
                if ans is not None:
                    return ans
+        else:
+            # found in kindle book pages on amazon.com
+            for x in root.xpath('//a[@id="acrCustomerReviewLink"]'):
+                spans = x.xpath('./span')
+                if spans:
+                    txt = self.tostring(spans[0], method='text', encoding='unicode', with_tail=False).strip()
+                    try:
+                        return float(txt.replace(',', '.'))
+                    except Exception:
+                        pass
    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html
@@ -784,6 +797,20 @@ class Worker(Thread):  # Get details {{{
    def parse_series(self, root):
        ans = (None, None)
+        # This is found on kindle pages for books on amazon.com
+        series = root.xpath('//*[@id="rpi-attribute-book_details-series"]')
+        if series:
+            spans = series[0].xpath('descendant::span')
+            if spans:
+                texts = [self.tostring(x, encoding='unicode', method='text', with_tail=False).strip() for x in spans]
+                texts = list(filter(None, texts))
+                if len(texts) == 2:
+                    idxinfo, series = texts
+                    m = re.search(r'[0-9.]+', idxinfo.strip())
+                    if m is not None:
+                        ans = series, float(m.group())
+                        return ans
        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
@@ -952,8 +979,11 @@ class Worker(Thread):  # Get details {{{
                if url:
                    return url
-    def parse_detail_bullets(self, root, mi, container):
+    def parse_detail_bullets(self, root, mi, container, ul_selector='.detail-bullet-list'):
-        ul = next(self.selector('.detail-bullet-list', root=container))
+        try:
+            ul = next(self.selector(ul_selector, root=container))
+        except StopIteration:
+            return
        for span in self.selector('.a-list-item', root=ul):
            cells = span.xpath('./span')
            if len(cells) >= 2:
@@ -1052,7 +1082,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
    name = 'Amazon.com'
-    version = (1, 3, 6)
+    version = (1, 3, 7)
    minimum_calibre_version = (2, 82, 0)
    description = _('Downloads metadata and covers from Amazon')
@@ -1732,8 +1762,7 @@ def manual_tests(domain, **kw):  # {{{
    all_tests['com'] = [  # {{{
        (   # Paperback with series
            {'identifiers': {'amazon': '1423146786'}},
-            [title_test('The Blood of Olympus',
+            [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
-                        exact=True), series_test('The Heroes of Olympus', 5)]
        ),
        (   # Kindle edition with series
@@ -1752,7 +1781,7 @@ def manual_tests(domain, **kw):  # {{{
        (  # Different comments markup, using Book Description section
            {'identifiers': {'amazon': '0982514506'}},
            [title_test(
-                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
+                "Griffin's Destiny",
                exact=True),
             comments_test('Jelena'), comments_test('Ashinji'),
             ]
@@ -1768,8 +1797,8 @@ def manual_tests(domain, **kw):  # {{{
        (  # No specific problems
            {'identifiers': {'isbn': '0743273567'}},
-            [title_test('the great gatsby: the only authorized edition', exact=True),
+            [title_test('the great gatsby'),
-             authors_test(['Francis Scott Fitzgerald'])]
+             authors_test(['f. Scott Fitzgerald'])]
        ),
    ]