Also strip text in parentheses automatically from the title when retrying the Google Books metadata query

2025-11-26 00:05:01 -05:00 · 2017-03-10 09:12:53 +05:30 · 2017-03-10 09:12:53 +05:30 · d19c60f61d
commit d19c60f61d
parent 932884c6c4
1 changed files with 41 additions and 45 deletions
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -4,6 +4,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import hashlib
 import re
 import time
 from Queue import Empty, Queue
@ -39,7 +40,6 @@ def get_details(browser, url, timeout):  # {{{
 # }}}
 xpath_cache = {}
@ -51,6 +51,12 @@ def XPath(x):
    return ans
 def cleanup_title(title):
    """Return a shortened variant of ``title``.

    If the title contains a colon, only the text before the first colon is
    returned. Otherwise, text of the form ``<head> (<anything>)`` is
    replaced by ``<head>``, which drops a parenthesised part that follows
    a space. A title matching neither form is returned unchanged.
    """
    head, colon, _rest = title.partition(':')
    if colon:
        # A separator was found, so keep only what precedes it.
        return head
    return re.sub(r'(.+?) \(.+\)', r'\1', title)
 def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
@ -67,6 +73,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    # print(etree.tostring(entry_, pretty_print=True))
    def get_text(extra, x):
@ -178,7 +185,8 @@ class GoogleBooks(Source):
    GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
    DUMMY_IMAGE_MD5 = frozenset(
-        {'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
+        {'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'}
    )
    def get_book_url(self, identifiers):  # {{{
        goog = identifiers.get('google', None)
@ -202,8 +210,7 @@ class GoogleBooks(Source):
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
-            author_tokens = self.get_author_tokens(
+            author_tokens = self.get_author_tokens(authors, only_first_author=True)
                authors, only_first_author=True)
            if author_tokens:
                q += ('+' if q else '') + build_term('author', author_tokens)
@ -323,8 +330,7 @@ class GoogleBooks(Source):
                    result_queue.put(ans)
            except:
                log.exception(
-                    'Failed to get metadata for identify entry:', etree.tostring(
+                    'Failed to get metadata for identify entry:', etree.tostring(i)
                        i)
                )
            if abort.is_set():
                break
@ -361,8 +367,7 @@ class GoogleBooks(Source):
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(
-                xml_to_unicode(clean_ascii_chars(
+                xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                    raw), strip_encoding_pats=True)[0],
                parser=parser
            )
            entries = entry(feed)
@ -381,15 +386,14 @@ class GoogleBooks(Source):
                    authors=authors,
                    timeout=timeout
                )
-            if ':' in title:
+            ntitle = cleanup_title(title)
-                title = title.partition(':')[0]
+            if ntitle and ntitle != title:
                if title:
                log('No results found, retrying without sub-title')
                return self.identify(
                    log,
                    result_queue,
                    abort,
-                        title=title,
+                    title=ntitle,
                    authors=authors,
                    timeout=timeout
                )
@ -407,8 +411,7 @@ if __name__ == '__main__':  # tests {{{
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test
    )
-    tests = [
+    tests = [({
        ({
        'identifiers': {
            'isbn': '0743273567'
        },
@ -417,21 +420,14 @@ if __name__ == '__main__':  # tests {{{
    }, [
        title_test('The great gatsby', exact=True),
        authors_test(['F. Scott Fitzgerald'])
-        ]
+    ]), ({
        ),
        ({
        'title': 'Flatland',
        'authors': ['Abbott']
-        }, [title_test('Flatland', exact=False)]
+    }, [title_test('Flatland', exact=False)]), ({
-        ),
+        'title':
-
+        'The Blood Red Indian Summer: A Berger and Mitry Mystery',
        ({
            'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
        'authors': ['David Handler'],
-        }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')]
+    }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')])]
        )
    ]
    test_identify_plugin(GoogleBooks.name, tests[:])
 # }}}