From 45e7e3f507cb42932fd19e26bb5694d1e6aee85a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 18 Feb 2012 14:04:41 +0530
Subject: [PATCH] Amazon metadata download: Try to scrape series information
 from the Amazon details page. Note that currently very few books have series
 info available. Often the page for the hardcover edition will have series
 info, but the Kindle edition will not. In such cases calibre may or may not
 find the series, depending on which page it uses.

---
 src/calibre/ebooks/metadata/sources/amazon.py | 55 +++++++++++++++++--
 1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index cb724765f5..b4a210f131 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -156,6 +156,16 @@ class Worker(Thread): # Get details {{{
             for name in names:
                 self.lang_map[name] = code
 
+        self.series_pat = re.compile(
+                r'''
+                \|\s*              # Prefix
+                (Series)\s*:\s*    # Series declaration
+                (?P<series>.+?)\s+  # The series name
+                \((Book)\s*    # Book declaration
+                (?P<index>[0-9.]+) # Series index
+                \s*\)
+                ''', re.X)
+
     def delocalize_datestr(self, raw):
         if not self.months:
             return raw
@@ -265,6 +275,15 @@ class Worker(Thread): # Get details {{{
         except:
             self.log.exception('Error parsing comments for url: %r'%self.url)
 
+        try:
+            series, series_index = self.parse_series(root)
+            if series:
+                mi.series, mi.series_index = series, series_index
+            elif self.testing:
+                mi.series, mi.series_index = 'Dummy series for testing', 1
+        except:
+            self.log.exception('Error parsing series for url: %r'%self.url)
+
         try:
             self.cover_url = self.parse_cover(root)
         except:
@@ -398,6 +417,20 @@ class Worker(Thread): # Get details {{{
             ans += self._render_comments(desc[0])
         return ans
 
+    def parse_series(self, root):
+        ans = (None, None)
+        desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
+        if desc:
+            raw = self.tostring(desc[0], method='text', encoding=unicode)
+            raw = re.sub(r'\s+', ' ', raw)
+            match = self.series_pat.search(raw)
+            if match is not None:
+                s, i = match.group('series'), float(match.group('index'))
+                if s:
+                    ans = (s, i)
+        return ans
+
+
     def parse_cover(self, root):
         imgs = root.xpath('//img[@id="prodImage" and @src]')
         if imgs:
@@ -457,7 +490,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
-        'languages'])
+        'languages', 'series'])
     has_html_comments = True
     supports_gzip_transfer_encoding = True
 
@@ -685,13 +718,15 @@ class Amazon(Source):
         from lxml.html import tostring
         import html5lib
 
+        testing = getattr(self, 'running_a_test', False)
+
         query, domain = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
             log.error('Insufficient metadata to construct query')
             return
         br = self.browser
-        if getattr(self, 'running_a_test', False):
+        if testing:
             print ('Using user agent for amazon: %s'%self.user_agent)
         try:
             raw = br.open_novisit(query, timeout=timeout).read().strip()
@@ -714,7 +749,7 @@ class Amazon(Source):
         raw = clean_ascii_chars(xml_to_unicode(raw,
             strip_encoding_pats=True, resolve_entities=True)[0])
 
-        if getattr(self, 'running_a_test', False):
+        if testing:
             import tempfile
             with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                     suffix='.html', delete=False) as f:
@@ -757,8 +792,7 @@ class Amazon(Source):
             return
 
         workers = [Worker(url, result_queue, br, log, i, domain, self,
-            testing=getattr(self, 'running_a_test', False)) for i, url in
-                enumerate(matches)]
+                            testing=testing) for i, url in enumerate(matches)]
 
         for w in workers:
             w.start()
@@ -820,9 +854,18 @@ if __name__ == '__main__': # tests {{{
     # To run these test use: calibre-debug -e
     # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test, comments_test)
+            isbn_test, title_test, authors_test, comments_test, series_test)
     com_tests = [ # {{{
 
+            ( # Series
+                {'identifiers':{'amazon':'0756407117'}},
+                [title_test(
+                "Throne of the Crescent Moon"
+                , exact=True), series_test('Crescent Moon Kingdoms', 1),
+                comments_test('Makhslood'),
+                ]
+            ),
+
             ( # Different comments markup, using Book Description section
                 {'identifiers':{'amazon':'0982514506'}},
                 [title_test(