Metadata download from Google: Switch to making the queries via a Google web search as this finds some books that searching via the API does not. Fixes #1982502 [Google metadata download suboptimal search](https://bugs.launchpad.net/calibre/+bug/1982502)

2025-07-08 18:54:09 -04:00 · 2022-07-26 21:51:03 +05:30 · 2022-07-26 21:51:03 +05:30 · d19150f852
commit d19150f852
parent 25fb58adfe
1 changed files with 132 additions and 35 deletions
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -13,7 +13,7 @@ except ImportError:

 from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata import check_isbn, authors_to_string
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
@ -91,30 +91,39 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
            log.exception('Programming error:')
        return None

-    id_url = entry_id(entry_)[0].text
-    google_id = id_url.split('/')[-1]
-    details_url = url(entry_)[0]
-    title_ = ': '.join([x.text for x in title(entry_)]).strip()
-    authors = [x.text.strip() for x in creator(entry_) if x.text]
-    if not authors:
-        authors = [_('Unknown')]
-    if not id_url or not title:
-        # Silently discard this entry
-        return None
-
-    mi = Metadata(title_, authors)
-    mi.identifiers = {'google': google_id}
-    try:
+    def get_extra_details():
        raw = get_details(browser, details_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
        )
-        extra = entry(feed)[0]
-    except:
-        log.exception('Failed to get additional details for', mi.title)
-        return mi
+        return entry(feed)[0]

+    if isinstance(entry_, str):
+        google_id = entry_
+        details_url = 'https://www.google.com/books/feeds/volumes/' + google_id
+        extra = get_extra_details()
+        title_ = ': '.join([x.text for x in title(extra)]).strip()
+        authors = [x.text.strip() for x in creator(extra) if x.text]
+    else:
+        id_url = entry_id(entry_)[0].text
+        google_id = id_url.split('/')[-1]
+        details_url = url(entry_)[0]
+        title_ = ': '.join([x.text for x in title(entry_)]).strip()
+        authors = [x.text.strip() for x in creator(entry_) if x.text]
+        if not id_url or not title:
+            # Silently discard this entry
+            return None
+        extra = None
+
+    if not authors:
+        authors = [_('Unknown')]
+    if not title:
+        return None
+    if extra is None:
+        extra = get_extra_details()
+    mi = Metadata(title_, authors)
+    mi.identifiers = {'google': google_id}
    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
@ -176,7 +185,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
 class GoogleBooks(Source):

    name = 'Google'
-    version = (1, 0, 3)
+    version = (1, 0, 4)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads metadata and covers from Google Books')

@ -355,6 +364,97 @@ class GoogleBooks(Source):

    # }}}

+    def identify_via_web_search(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30
+    ):
+        isbn = check_isbn(identifiers.get('isbn', None))
+        q = []
+
+        def to_check_tokens(*tokens):
+            for t in tokens:
+                if len(t) < 3:
+                    continue
+                t = t.lower()
+                if t in ('and', 'not', 'the'):
+                    continue
+                yield t.strip(':')
+
+        check_tokens = set()
+        if isbn is not None:
+            q.append(isbn)
+        elif title or authors:
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += title_tokens
+                check_tokens |= set(to_check_tokens(*title_tokens))
+            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
+            if author_tokens:
+                q += author_tokens
+                check_tokens |= set(to_check_tokens(*author_tokens))
+        if not q:
+            return None
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        se = search_engines_module()
+        url = se.google_format_query(q, tbm='bks')
+        log('Making query:', url)
+        br = se.google_specialize_browser(se.browser())
+        r = []
+        root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
+        pat = re.compile(r'id=([^&]+)')
+        google_ids = []
+        for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
+            m = pat.search(q.url)
+            if m is None:
+                continue
+            google_ids.append(m.group(1))
+
+        if not google_ids and isbn and (title or authors):
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+        found = False
+        seen = set()
+        for relevance, gid in enumerate(google_ids):
+            if gid in seen:
+                continue
+            seen.add(gid)
+            try:
+                ans = to_metadata(br, log, gid, timeout)
+                if isinstance(ans, Metadata):
+                    if isbn:
+                        if isbn not in ans.all_isbns:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,
+                                'not in', ' '.join(ans.all_isbns))
+                            continue
+                    elif check_tokens:
+                        candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))
+                        candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))
+                        if candidate.intersection(check_tokens) != check_tokens:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
+                            continue
+                    ans.source_relevance = relevance
+                    goog = ans.identifiers['google']
+                    for isbn in getattr(ans, 'all_isbns', []):
+                        self.cache_isbn_to_identifier(isbn, goog)
+                    if getattr(ans, 'has_google_cover', False):
+                        self.cache_identifier_to_cover_url(
+                            goog, self.GOOGLE_COVER % goog
+                        )
+                    self.clean_downloaded_metadata(ans)
+                    result_queue.put(ans)
+                    found = True
+            except:
+                log.exception('Failed to get metadata for google books id:', gid)
+            if abort.is_set():
+                break
+            if not found and isbn and (title or authors):
+                return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+
    def identify(  # {{{
        self,
        log,
@ -365,6 +465,9 @@ class GoogleBooks(Source):
        identifiers={},
        timeout=30
    ):
+        if True:
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
+
        from lxml import etree
        entry = XPath('//atom:entry')

@ -441,20 +544,7 @@ if __name__ == '__main__':  # tests {{{
    )
    tests = [
    ({
-        'identifiers': {
-            'isbn': '978-0-7869-5437-7'  # needs capitalized ISBN to find results
-        },
-        'title': 'Dragons of Autumn Twilight',
-        'authors': ['Margaret Weis', 'Tracy Hickman']
-    }, [
-        title_test('The great gatsby', exact=True),
-        authors_test(['F. Scott Fitzgerald'])
-    ]),
-
-    ({
-        'identifiers': {
-            'isbn': '0743273567'
-        },
+        'identifiers': {'isbn': '0743273567'},
        'title': 'Great Gatsby',
        'authors': ['Fitzgerald']
    }, [
@ -470,7 +560,14 @@ if __name__ == '__main__':  # tests {{{
        'The Blood Red Indian Summer: A Berger and Mitry Mystery',
        'authors': ['David Handler'],
    }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
-    ])
+    ]),
+    ({
+        'identifiers': {'isbn': '9781618246509'},
+    }, [
+        title_test('The dragon done it', exact=True),
+        authors_test(['Eric Flint', 'Mike Resnick'])
+    ]),
+
    ]
    test_identify_plugin(GoogleBooks.name, tests[:])