From 4526abb5e32d97e7772ec99cfc5f186b5e089837 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Jan 2024 20:52:01 +0530 Subject: [PATCH] Amazon metadata download: Fixes for various changes to amazon website markup --- src/calibre/ebooks/metadata/sources/amazon.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 539aa4c20c..14ac0bed50 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -497,6 +497,7 @@ class Worker(Thread): # Get details {{{ non_hero = tuple(self.selector( 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector( '#productDetails_techSpec_sections')) + feature_and_detail_bullets = root.xpath('//*[@data-feature-name="featureBulletsAndDetailBullets"]') if detail_bullets: self.parse_detail_bullets(root, mi, detail_bullets[0]) elif non_hero: @@ -505,6 +506,8 @@ class Worker(Thread): # Get details {{{ except: self.log.exception( 'Failed to parse new-style book details section') + elif feature_and_detail_bullets: + self.parse_detail_bullets(root, mi, feature_and_detail_bullets[0], ul_selector='ul') else: pd = root.xpath(self.pd_xpath) @@ -674,6 +677,16 @@ class Worker(Thread): # Get details {{{ ans = parse_ratings_text(t) if ans is not None: return ans + else: + # found in kindle book pages on amazon.com + for x in root.xpath('//a[@id="acrCustomerReviewLink"]'): + spans = x.xpath('./span') + if spans: + txt = self.tostring(spans[0], method='text', encoding='unicode', with_tail=False).strip() + try: + return float(txt.replace(',', '.')) + except Exception: + pass def _render_comments(self, desc): from calibre.library.comments import sanitize_comments_html @@ -784,6 +797,20 @@ class Worker(Thread): # Get details {{{ def parse_series(self, root): ans = (None, None) + # This is found on kindle pages for books on amazon.com + series = root.xpath('//*[@id="rpi-attribute-book_details-series"]') + if series: + spans = series[0].xpath('descendant::span') + if spans: + texts = [self.tostring(x, encoding='unicode', method='text', with_tail=False).strip() for x in spans] + texts = list(filter(None, texts)) + if len(texts) == 2: + idxinfo, series = texts + m = re.search(r'[0-9.]+', idxinfo.strip()) + if m is not None: + ans = series, float(m.group()) + return ans + # This is found on the paperback/hardback pages for books on amazon.com series = root.xpath('//div[@data-feature-name="seriesTitle"]') if series: @@ -952,8 +979,11 @@ class Worker(Thread): # Get details {{{ if url: return url - def parse_detail_bullets(self, root, mi, container): - ul = next(self.selector('.detail-bullet-list', root=container)) + def parse_detail_bullets(self, root, mi, container, ul_selector='.detail-bullet-list'): + try: + ul = next(self.selector(ul_selector, root=container)) + except StopIteration: + return for span in self.selector('.a-list-item', root=ul): cells = span.xpath('./span') if len(cells) >= 2: @@ -1052,7 +1082,7 @@ class Worker(Thread): # Get details {{{ class Amazon(Source): name = 'Amazon.com' - version = (1, 3, 6) + version = (1, 3, 7) minimum_calibre_version = (2, 82, 0) description = _('Downloads metadata and covers from Amazon') @@ -1732,8 +1762,7 @@ def manual_tests(domain, **kw): # {{{ all_tests['com'] = [ # {{{ ( # Paperback with series {'identifiers': {'amazon': '1423146786'}}, - [title_test('The Blood of Olympus', - exact=True), series_test('The Heroes of Olympus', 5)] + [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)] ), ( # Kindle edition with series @@ -1752,7 +1781,7 @@ def manual_tests(domain, **kw): # {{{ ( # Different comments markup, using Book Description section {'identifiers': {'amazon': '0982514506'}}, [title_test( - "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy", + "Griffin's Destiny", exact=True), comments_test('Jelena'), comments_test('Ashinji'), ] @@ -1768,8 +1797,8 @@ def manual_tests(domain, **kw): # {{{ ( # No specific problems {'identifiers': {'isbn': '0743273567'}}, - [title_test('the great gatsby: the only authorized edition', exact=True), - authors_test(['Francis Scott Fitzgerald'])] + [title_test('the great gatsby'), + authors_test(['f. Scott Fitzgerald'])] ), ]