mirror of https://github.com/kovidgoyal/calibre.git
Metadata download from Google: Switch to making the queries via a Google web search as this finds some books that searching via the API does not. Fixes #1982502 [Google metadata download suboptimal search](https://bugs.launchpad.net/calibre/+bug/1982502)
This commit is contained in:
parent 25fb58adfe
commit d19150f852
src/calibre/ebooks/metadata/sources/google.py

@@ -13,7 +13,7 @@ except ImportError:
 from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata import check_isbn, authors_to_string
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.utils.localization import canonicalize_lang
@@ -91,30 +91,39 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
         log.exception('Programming error:')
         return None
 
-    id_url = entry_id(entry_)[0].text
-    google_id = id_url.split('/')[-1]
-    details_url = url(entry_)[0]
-    title_ = ': '.join([x.text for x in title(entry_)]).strip()
-    authors = [x.text.strip() for x in creator(entry_) if x.text]
-    if not authors:
-        authors = [_('Unknown')]
-    if not id_url or not title:
-        # Silently discard this entry
-        return None
-
-    mi = Metadata(title_, authors)
-    mi.identifiers = {'google': google_id}
-    try:
+    def get_extra_details():
         raw = get_details(browser, details_url, timeout)
         feed = etree.fromstring(
             xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
             parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
         )
-        extra = entry(feed)[0]
-    except:
-        log.exception('Failed to get additional details for', mi.title)
-        return mi
+        return entry(feed)[0]
 
+    if isinstance(entry_, str):
+        google_id = entry_
+        details_url = 'https://www.google.com/books/feeds/volumes/' + google_id
+        extra = get_extra_details()
+        title_ = ': '.join([x.text for x in title(extra)]).strip()
+        authors = [x.text.strip() for x in creator(extra) if x.text]
+    else:
+        id_url = entry_id(entry_)[0].text
+        google_id = id_url.split('/')[-1]
+        details_url = url(entry_)[0]
+        title_ = ': '.join([x.text for x in title(entry_)]).strip()
+        authors = [x.text.strip() for x in creator(entry_) if x.text]
+        if not id_url or not title:
+            # Silently discard this entry
+            return None
+        extra = None
+
+    if not authors:
+        authors = [_('Unknown')]
+    if not title:
+        return None
+    if extra is None:
+        extra = get_extra_details()
+    mi = Metadata(title_, authors)
+    mi.identifiers = {'google': google_id}
     mi.comments = get_text(extra, description)
     lang = canonicalize_lang(get_text(extra, language))
     if lang:
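Note on the hunk above: to_metadata() now accepts either an lxml Atom <entry> element from the API feed or a bare Google Books volume id string coming from a web-search hit; in the string case the details feed is fetched up front, otherwise it is fetched lazily via get_extra_details(). A minimal standalone sketch of that dispatch (the volume id below is illustrative, not from the commit):

def resolve_details_url(entry_or_id):
    # Mirrors the new isinstance(entry_, str) branch in to_metadata():
    # a plain string is treated as a bare Google Books volume id.
    if isinstance(entry_or_id, str):
        return 'https://www.google.com/books/feeds/volumes/' + entry_or_id
    # Otherwise it would be an Atom <entry> element, whose details URL
    # is read from the feed itself (url(entry_)[0] in the real code).
    raise TypeError('this sketch only handles the volume-id case')

print(resolve_details_url('s1gVAAAAYAAJ'))
# -> https://www.google.com/books/feeds/volumes/s1gVAAAAYAAJ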
@@ -176,7 +185,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
 class GoogleBooks(Source):
 
     name = 'Google'
-    version = (1, 0, 3)
+    version = (1, 0, 4)
     minimum_calibre_version = (2, 80, 0)
     description = _('Downloads metadata and covers from Google Books')
 
@@ -355,6 +364,97 @@ class GoogleBooks(Source):
 
     # }}}
 
+    def identify_via_web_search(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30
+    ):
+        isbn = check_isbn(identifiers.get('isbn', None))
+        q = []
+
+        def to_check_tokens(*tokens):
+            for t in tokens:
+                if len(t) < 3:
+                    continue
+                t = t.lower()
+                if t in ('and', 'not', 'the'):
+                    continue
+                yield t.strip(':')
+
+        check_tokens = set()
+        if isbn is not None:
+            q.append(isbn)
+        elif title or authors:
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += title_tokens
+                check_tokens |= set(to_check_tokens(*title_tokens))
+            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
+            if author_tokens:
+                q += author_tokens
+                check_tokens |= set(to_check_tokens(*author_tokens))
+        if not q:
+            return None
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        se = search_engines_module()
+        url = se.google_format_query(q, tbm='bks')
+        log('Making query:', url)
+        br = se.google_specialize_browser(se.browser())
+        r = []
+        root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
+        pat = re.compile(r'id=([^&]+)')
+        google_ids = []
+        for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
+            m = pat.search(q.url)
+            if m is None:
+                continue
+            google_ids.append(m.group(1))
+
+        if not google_ids and isbn and (title or authors):
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+        found = False
+        seen = set()
+        for relevance, gid in enumerate(google_ids):
+            if gid in seen:
+                continue
+            seen.add(gid)
+            try:
+                ans = to_metadata(br, log, gid, timeout)
+                if isinstance(ans, Metadata):
+                    if isbn:
+                        if isbn not in ans.all_isbns:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,
+                                'not in', ' '.join(ans.all_isbns))
+                            continue
+                    elif check_tokens:
+                        candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))
+                        candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))
+                        if candidate.intersection(check_tokens) != check_tokens:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
+                            continue
+                    ans.source_relevance = relevance
+                    goog = ans.identifiers['google']
+                    for isbn in getattr(ans, 'all_isbns', []):
+                        self.cache_isbn_to_identifier(isbn, goog)
+                    if getattr(ans, 'has_google_cover', False):
+                        self.cache_identifier_to_cover_url(
+                            goog, self.GOOGLE_COVER % goog
+                        )
+                    self.clean_downloaded_metadata(ans)
+                    result_queue.put(ans)
+                    found = True
+            except:
+                log.exception('Failed to get metadata for google books id:', gid)
+            if abort.is_set():
+                break
+        if not found and isbn and (title or authors):
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+
     def identify(  # {{{
         self,
         log,
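Two pieces of the new method are easy to sanity-check in isolation: the to_check_tokens() filter that prunes the query tokens used for post-search verification, and the id=([^&]+) pattern that pulls the volume id out of each result URL's query string. The sample URL below is illustrative, not from the commit:

import re

def to_check_tokens(*tokens):
    # Same filter as in identify_via_web_search() above: drop tokens
    # shorter than three characters and a few stop words, lower-case,
    # and strip trailing colons.
    for t in tokens:
        if len(t) < 3:
            continue
        t = t.lower()
        if t in ('and', 'not', 'the'):
            continue
        yield t.strip(':')

print(list(to_check_tokens('The', 'Dragon', 'Done', 'It:')))
# -> ['dragon', 'done', 'it']

# The volume-id extraction applied to each web-search result URL.
pat = re.compile(r'id=([^&]+)')
m = pat.search('https://books.google.com/books?id=s1gVAAAAYAAJ&printsec=frontcover')
print(m.group(1))  # -> s1gVAAAAYAAJ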
@@ -365,6 +465,9 @@ class GoogleBooks(Source):
         identifiers={},
         timeout=30
     ):
+        if True:
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
+
         from lxml import etree
         entry = XPath('//atom:entry')
 
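A note on the change above: the if True: guard makes identify() delegate unconditionally to the new web-search implementation while leaving the old API-based body in place (now unreachable) below it, so the switch can be reverted by deleting two lines. The effective behaviour is just this (a sketch, not the committed code):

def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    # Everything below the early return in the committed method is the
    # legacy API-based lookup, kept in the file but never reached.
    return self.identify_via_web_search(
        log, result_queue, abort, title, authors, identifiers, timeout)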
@@ -441,20 +544,7 @@ if __name__ == '__main__':  # tests {{{
     )
     tests = [
         ({
-            'identifiers': {
-                'isbn': '978-0-7869-5437-7'  # needs capitalized ISBN to find results
-            },
-            'title': 'Dragons of Autumn Twilight',
-            'authors': ['Margaret Weis', 'Tracy Hickman']
-        }, [
-            title_test('The great gatsby', exact=True),
-            authors_test(['F. Scott Fitzgerald'])
-        ]),
-
-        ({
-            'identifiers': {
-                'isbn': '0743273567'
-            },
+            'identifiers': {'isbn': '0743273567'},
             'title': 'Great Gatsby',
             'authors': ['Fitzgerald']
         }, [
@@ -470,7 +560,14 @@ if __name__ == '__main__':  # tests {{{
             'The Blood Red Indian Summer: A Berger and Mitry Mystery',
             'authors': ['David Handler'],
         }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
-        ])
+        ]),
+        ({
+            'identifiers': {'isbn': '9781618246509'},
+        }, [
+            title_test('The dragon done it', exact=True),
+            authors_test(['Eric Flint', 'Mike Resnick'])
+        ]),
+
     ]
     test_identify_plugin(GoogleBooks.name, tests[:])
 
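The new ISBN-only test exercises the q.append(isbn) branch of identify_via_web_search(), where the whole web query is the validated ISBN. A sketch of that branch for the new test's input, assuming a calibre environment for the check_isbn import:

from calibre.ebooks.metadata import check_isbn

identifiers = {'isbn': '9781618246509'}  # input of the new test case

isbn = check_isbn(identifiers.get('isbn', None))
q = []
if isbn is not None:
    # ISBN-only search: the query list is just the ISBN itself.
    q.append(isbn)
print(q)  # expected: ['9781618246509']

Per calibre's usual convention for metadata source plugins, the whole test list runs by executing this source file under calibre-debug.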