From c31ff8f30e229995ff199dac859b8d6829986fab Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 28 Feb 2017 14:57:07 +0530
Subject: [PATCH] Google metadata download: Fix metadata not being found when
 the title of the book includes a sub-title

---
 src/calibre/ebooks/metadata/sources/google.py | 116 +++++++++++-------
 1 file changed, 70 insertions(+), 46 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index 8a03ccd96e..4f055c8ab0 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -5,7 +5,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import hashlib
 import time
-from functools import partial
 from Queue import Empty, Queue
 
 from calibre import as_unicode
@@ -41,9 +40,19 @@ def get_details(browser, url, timeout):  # {{{
 # }}}
 
 
+xpath_cache = {}
+
+
+def XPath(x):  # return the compiled XPath for expression x, memoized in xpath_cache
+    ans = xpath_cache.get(x)
+    if ans is None:
+        from lxml import etree  # imported only on a cache miss
+        ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)
+    return ans
+
+
 def to_metadata(browser, log, entry_, timeout):  # {{{
     from lxml import etree
-    XPath = partial(etree.XPath, namespaces=NAMESPACES)
 
     # total_results  = XPath('//openSearch:totalResults')
     # start_index    = XPath('//openSearch:startIndex')
@@ -58,7 +67,6 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
     subject = XPath('descendant::dc:subject')
     description = XPath('descendant::dc:description')
     language = XPath('descendant::dc:language')
-    rating = XPath('descendant::gd:rating[@average]')
     # print(etree.tostring(entry_, pretty_print=True))
 
     def get_text(extra, x):
@@ -138,15 +146,6 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
         except:
             log.error('Failed to parse pubdate %r' % pubdate)
 
-    # Ratings
-    for x in rating(extra):
-        try:
-            mi.rating = float(x.get('average'))
-            if mi.rating > 5:
-                mi.rating /= 2
-        except:
-            log.exception('Failed to parse rating')
-
     # Cover
     mi.has_google_cover = None
     for x in extra.xpath(
@@ -178,7 +177,8 @@ class GoogleBooks(Source):
 
     GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
 
-    DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
+    DUMMY_IMAGE_MD5 = frozenset(
+        {'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
 
     def get_book_url(self, identifiers):  # {{{
         goog = identifiers.get('google', None)
@@ -202,7 +202,8 @@ class GoogleBooks(Source):
             title_tokens = list(self.get_title_tokens(title))
             if title_tokens:
                 q += build_term('title', title_tokens)
-            author_tokens = self.get_author_tokens(authors, only_first_author=True)
+            author_tokens = self.get_author_tokens(
+                authors, only_first_author=True)
             if author_tokens:
                 q += ('+' if q else '') + build_term('author', author_tokens)
 
@@ -322,7 +323,8 @@ class GoogleBooks(Source):
                     result_queue.put(ans)
             except:
                 log.exception(
-                    'Failed to get metadata for identify entry:', etree.tostring(i)
+                    'Failed to get metadata for identify entry:', etree.tostring(
+                        i)
                 )
             if abort.is_set():
                 break
@@ -340,7 +342,6 @@ class GoogleBooks(Source):
         timeout=30
     ):
         from lxml import etree
-        XPath = partial(etree.XPath, namespaces=NAMESPACES)
         entry = XPath('//atom:entry')
 
         query = self.create_query(
@@ -350,7 +351,7 @@ class GoogleBooks(Source):
             log.error('Insufficient metadata to construct query')
             return
         br = self.browser
-        self.log('Making query:', query)
+        log('Making query:', query)
         try:
             raw = br.open_novisit(query, timeout=timeout).read()
         except Exception as e:
@@ -360,7 +361,8 @@ class GoogleBooks(Source):
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(
-                xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
+                xml_to_unicode(clean_ascii_chars(
+                    raw), strip_encoding_pats=True)[0],
                 parser=parser
             )
             entries = entry(feed)
@@ -368,16 +370,29 @@ class GoogleBooks(Source):
             log.exception('Failed to parse identify results')
             return as_unicode(e)
 
-        if not entries and identifiers and title and authors and \
-                not abort.is_set():
-            return self.identify(
-                log,
-                result_queue,
-                abort,
-                title=title,
-                authors=authors,
-                timeout=timeout
-            )
+        if not entries and title and not abort.is_set():
+            if identifiers:
+                log('No results found, retrying without identifiers')
+                return self.identify(
+                    log,
+                    result_queue,
+                    abort,
+                    title=title,
+                    authors=authors,
+                    timeout=timeout
+                )
+            if ':' in title:
+                title = title.partition(':')[0]
+                if title:
+                    log('No results found, retrying without sub-title')
+                    return self.identify(
+                        log,
+                        result_queue,
+                        abort,
+                        title=title,
+                        authors=authors,
+                        timeout=timeout
+                    )
 
         # There is no point running these queries in threads as google
         # throttles requests returning 403 Forbidden errors
@@ -387,27 +402,36 @@ class GoogleBooks(Source):
 
 
 if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/google.py
+    # To run these tests use: calibre-debug
+    # src/calibre/ebooks/metadata/sources/google.py
     from calibre.ebooks.metadata.sources.test import (
         test_identify_plugin, title_test, authors_test
     )
-    test_identify_plugin(
-        GoogleBooks.name, [
-            ({
-                'identifiers': {
-                    'isbn': '0743273567'
-                },
-                'title': 'Great Gatsby',
-                'authors': ['Fitzgerald']
-            }, [
-                title_test('The great gatsby', exact=True),
-                authors_test(['F. Scott Fitzgerald'])
-            ]),
-            ({
-                'title': 'Flatland',
-                'authors': ['Abbott']
-            }, [title_test('Flatland', exact=False)]),
+    tests = [
+        ({
+            'identifiers': {
+                'isbn': '0743273567'
+            },
+            'title': 'Great Gatsby',
+            'authors': ['Fitzgerald']
+        }, [
+            title_test('The great gatsby', exact=True),
+            authors_test(['F. Scott Fitzgerald'])
         ]
-    )
+        ),
+
+        ({
+            'title': 'Flatland',
+            'authors': ['Abbott']
+        }, [title_test('Flatland', exact=False)]
+        ),
+
+        ({
+            'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
+            'authors': ['David Handler'],
+        }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')]
+        )
+    ]
+    test_identify_plugin(GoogleBooks.name, tests[:])
 
 # }}}