Amazon metadata download: Get series information from the amazon.com page, when available

2025-11-30 10:15:02 -05:00 · 2016-06-05 11:46:17 +05:30 · 2016-06-05 11:46:17 +05:30 · cbfb38ffae
commit cbfb38ffae
parent 84c97d3cfa
1 changed files with 56 additions and 10 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -549,6 +549,35 @@ class Worker(Thread):  # Get details {{{

    def parse_series(self, root):
        ans = (None, None)
+
+        # This is found on the paperback/hardback pages for books on amazon.com
+        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
+        if series:
+            series = series[0]
+            spans = series.xpath('./span')
+            if spans:
+                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
+                m = re.search('\s+([0-9.]+)$', raw.strip())
+                if m is not None:
+                    series_index = float(m.group(1))
+                    s = series.xpath('./a[@id="series-page-link"]')
+                    if s:
+                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
+                        if series:
+                            ans = (series, series_index)
+        # This is found on Kindle edition pages on amazon.com
+        if ans == (None, None):
+            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
+                text = (span.text or '').strip()
+                m = re.match('Book\s+([0-9.]+)', text)
+                if m is not None:
+                    series_index = float(m.group(1))
+                    a = span.xpath('./a[@href]')
+                    if a:
+                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
+                        if series:
+                            ans = (series, series_index)
+        if ans == (None, None):
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding=unicode)
@ -558,6 +587,9 @@ class Worker(Thread):  # Get details {{{
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
+        if ans[0]:
+            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
+            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans

    def parse_tags(self, root):
@ -838,6 +870,10 @@ class Amazon(Source):
            (mi.is_null('language') and self.domain in {'com', 'uk'})
        )
        if mi.title and docase:
+            # Remove series information from title
+            m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
+            if m is not None:
+                mi.title = mi.title.replace(m.group(1), '').strip()
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if mi.tags and docase:
@ -1155,9 +1191,19 @@ class Amazon(Source):
 if __name__ == '__main__':  # tests {{{
    # To run these tests use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test, comments_test)
+            isbn_test, title_test, authors_test, comments_test, series_test)
    com_tests = [  # {{{

+            (   # Paperback with series
+                {'identifiers':{'amazon':'1423146786'}},
+                [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
+            ),
+
+            (   # Kindle edition with series
+                {'identifiers':{'amazon':'B0085UEQDO'}},
+                [title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
+            ),
+
            (   # A kindle edition that does not appear in the search results when searching by ASIN
                {'identifiers':{'amazon':'B004JHY6OG'}},
                [title_test('The Heroes: A First Law Novel', exact=True)]