From 18fd3ae063794fc1c6f7932ab166dd137a06aeef Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 27 Apr 2019 12:05:24 +0530
Subject: [PATCH] Amazon metadata: Make title processing more robust

Also make it easier to run tests selectively from the command line
---
 src/calibre/ebooks/metadata/sources/amazon.py | 62 ++++++++++++-------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 44c6f19420..5e2a6ce822 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -475,13 +475,26 @@ class Worker(Thread):  # Get details {{{
         return self.tostring(elem, encoding='unicode', method='text').strip()
 
     def parse_title(self, root):
+
+        def sanitize_title(title):  # strip bracketed qualifiers from a raw title
+            ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
+            if not ans:
+                ans = title.rpartition('[')[0].strip()  # regex removed everything: keep the text before the last '['
+            return ans
+
         h1 = root.xpath('//h1[@id="title"]')
         if h1:
             h1 = h1[0]
             for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
                 h1.remove(child)
-            return self.totext(h1)
-        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
+            return sanitize_title(self.totext(h1))
+        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
+        if not tdiv:
+            span = root.xpath('//*[@id="ebooksTitle"]')  # match the id attribute (@id), not a child <id> element
+            if span:
+                return sanitize_title(self.totext(span[0]))
+            raise ValueError('No title block found')
+        tdiv = tdiv[0]
         actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
         if actual_title:
             title = self.tostring(actual_title[0], encoding='unicode',
@@ -489,10 +502,7 @@ class Worker(Thread):  # Get details {{{
         else:
             title = self.tostring(tdiv, encoding='unicode',
                                   method='text').strip()
-        ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
-        if not ans:
-            ans = title.rpartition('[')[0].strip()
-        return ans
+        return sanitize_title(title)
 
     def parse_authors(self, root):
         for sel in (
@@ -851,7 +861,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 6)
+    version = (1, 2, 7)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1480,12 +1490,13 @@ class Amazon(Source):
     # }}}
 
 
-if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug
-    # src/calibre/ebooks/metadata/sources/amazon.py
+def manual_tests(domain, **kw):  # {{{
+    # To run these tests use:
+    # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
                                                       isbn_test, title_test, authors_test, comments_test, series_test)
-    com_tests = [  # {{{
+    all_tests = {}
+    all_tests['com'] = [  # {{{
 
         (   # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
@@ -1533,7 +1544,7 @@ if __name__ == '__main__':  # tests {{{
 
     # }}}
 
-    de_tests = [  # {{{
+    all_tests['de'] = [  # {{{
         (
             {'identifiers': {'isbn': '9783453314979'}},
             [title_test('Die letzten Wächter: Roman',
@@ -1551,7 +1562,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    it_tests = [  # {{{
+    all_tests['it'] = [  # {{{
         (
             {'identifiers': {'isbn': '8838922195'}},
             [title_test('La briscola in cinque',
@@ -1561,7 +1572,13 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    fr_tests = [  # {{{
+    all_tests['fr'] = [  # {{{
+        (
+            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
+            [title_test('Le secret de Lola', exact=True),
+                authors_test(['Amélie BRIZIO'])
+            ]
+        ),
         (
             {'identifiers': {'isbn': '2221116798'}},
             [title_test('L\'étrange voyage de Monsieur Daldry',
@@ -1571,7 +1588,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    es_tests = [  # {{{
+    all_tests['es'] = [  # {{{
         (
             {'identifiers': {'isbn': '8483460831'}},
             [title_test('Tiempos Interesantes',
@@ -1581,7 +1598,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    jp_tests = [  # {{{
+    all_tests['jp'] = [  # {{{
         (  # Adult filtering test
             {'identifiers': {'isbn': '4799500066'}},
             [title_test(u'Ｂｉｔｃｈ Ｔｒａｐ'), ]
@@ -1600,7 +1617,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    br_tests = [  # {{{
+    all_tests['br'] = [  # {{{
         (
             {'title': 'Guerra dos Tronos'},
             [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
@@ -1610,7 +1627,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    nl_tests = [  # {{{
+    all_tests['nl'] = [  # {{{
         (
             {'title': 'Freakonomics'},
             [title_test('Freakonomics',
@@ -1620,7 +1637,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    cn_tests = [  # {{{
+    all_tests['cn'] = [  # {{{
         (
             {'identifiers': {'isbn': '9787115369512'}},
             [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
@@ -1635,7 +1652,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    ca_tests = [  # {{{
+    all_tests['ca'] = [  # {{{
         (   # Paperback with series
             {'identifiers': {'isbn': '9781623808747'}},
             [title_test('Parting Shot', exact=True),
@@ -1655,7 +1672,7 @@ if __name__ == '__main__':  # tests {{{
     ]  # }}}
 
     def do_test(domain, start=0, stop=None, server='auto'):
-        tests = globals().get(domain + '_tests')
+        tests = all_tests[domain]
         if stop is None:
             stop = len(tests)
         tests = tests[start:stop]
@@ -1665,6 +1682,5 @@ if __name__ == '__main__':  # tests {{{
             setattr(p, 'testing_server', server),
         ))
 
-    do_test('com')
-    # do_test('de')
+    do_test(domain, **kw)
 # }}}