diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 23b79dbd67..ac0ee3472a 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -13,7 +13,7 @@ except ImportError: from calibre import as_unicode from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata import check_isbn, authors_to_string from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.sources.base import Source from calibre.utils.cleantext import clean_ascii_chars @@ -91,30 +91,39 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.exception('Programming error:') return None - id_url = entry_id(entry_)[0].text - google_id = id_url.split('/')[-1] - details_url = url(entry_)[0] - title_ = ': '.join([x.text for x in title(entry_)]).strip() - authors = [x.text.strip() for x in creator(entry_) if x.text] - if not authors: - authors = [_('Unknown')] - if not id_url or not title: - # Silently discard this entry - return None - - mi = Metadata(title_, authors) - mi.identifiers = {'google': google_id} - try: + def get_extra_details(): raw = get_details(browser, details_url, timeout) feed = etree.fromstring( xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) ) - extra = entry(feed)[0] - except: - log.exception('Failed to get additional details for', mi.title) - return mi + return entry(feed)[0] + if isinstance(entry_, str): + google_id = entry_ + details_url = 'https://www.google.com/books/feeds/volumes/' + google_id + extra = get_extra_details() + title_ = ': '.join([x.text for x in title(extra)]).strip() + authors = [x.text.strip() for x in creator(extra) if x.text] + else: + id_url = entry_id(entry_)[0].text + google_id = id_url.split('/')[-1] + details_url = url(entry_)[0] + title_ = ': '.join([x.text for x in title(entry_)]).strip() + authors = [x.text.strip() for x in creator(entry_) if x.text] + if not id_url or not title: + # Silently discard this entry + return None + extra = None + + if not authors: + authors = [_('Unknown')] + if not title: + return None + if extra is None: + extra = get_extra_details() + mi = Metadata(title_, authors) + mi.identifiers = {'google': google_id} mi.comments = get_text(extra, description) lang = canonicalize_lang(get_text(extra, language)) if lang: @@ -176,7 +185,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ class GoogleBooks(Source): name = 'Google' - version = (1, 0, 3) + version = (1, 0, 4) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Google Books') @@ -355,6 +364,97 @@ class GoogleBooks(Source): # }}} + def identify_via_web_search( # {{{ + self, + log, + result_queue, + abort, + title=None, + authors=None, + identifiers={}, + timeout=30 + ): + isbn = check_isbn(identifiers.get('isbn', None)) + q = [] + + def to_check_tokens(*tokens): + for t in tokens: + if len(t) < 3: + continue + t = t.lower() + if t in ('and', 'not', 'the'): + continue + yield t.strip(':') + + check_tokens = set() + if isbn is not None: + q.append(isbn) + elif title or authors: + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += title_tokens + check_tokens |= set(to_check_tokens(*title_tokens)) + author_tokens = list(self.get_author_tokens(authors, only_first_author=True)) + if author_tokens: + q += author_tokens + check_tokens |= set(to_check_tokens(*author_tokens)) + if not q: + return None + from calibre.ebooks.metadata.sources.update import search_engines_module + se = search_engines_module() + url = se.google_format_query(q, tbm='bks') + log('Making query:', url) + br = se.google_specialize_browser(se.browser()) + r = [] + root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append) + pat = re.compile(r'id=([^&]+)') + google_ids = [] + for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False): + m = pat.search(q.url) + if m is None: + continue + google_ids.append(m.group(1)) + + if not google_ids and isbn and (title or authors): + return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout) + found = False + seen = set() + for relevance, gid in enumerate(google_ids): + if gid in seen: + continue + seen.add(gid) + try: + ans = to_metadata(br, log, gid, timeout) + if isinstance(ans, Metadata): + if isbn: + if isbn not in ans.all_isbns: + log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn, + 'not in', ' '.join(ans.all_isbns)) + continue + elif check_tokens: + candidate = set(to_check_tokens(*self.get_title_tokens(ans.title))) + candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors))) + if candidate.intersection(check_tokens) != check_tokens: + log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query') + continue + ans.source_relevance = relevance + goog = ans.identifiers['google'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, goog) + if getattr(ans, 'has_google_cover', False): + self.cache_identifier_to_cover_url( + goog, self.GOOGLE_COVER % goog + ) + self.clean_downloaded_metadata(ans) + result_queue.put(ans) + found = True + except: + log.exception('Failed to get metadata for google books id:', gid) + if abort.is_set(): + break + if not found and isbn and (title or authors): + return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout) + def identify( # {{{ self, log, @@ -365,6 +465,9 @@ class GoogleBooks(Source): identifiers={}, timeout=30 ): + if True: + return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout) + from lxml import etree entry = XPath('//atom:entry') @@ -441,20 +544,7 @@ if __name__ == '__main__': # tests {{{ ) tests = [ ({ - 'identifiers': { - 'isbn': '978-0-7869-5437-7' # needs capitalized ISBN to find results - }, - 'title': 'Dragons of Autumn Twilight', - 'authors': ['Margaret Weis', 'Tracy Hickman'] - }, [ - title_test('The great gatsby', exact=True), - authors_test(['F. Scott Fitzgerald']) - ]), - - ({ - 'identifiers': { - 'isbn': '0743273567' - }, + 'identifiers': {'isbn': '0743273567'}, 'title': 'Great Gatsby', 'authors': ['Fitzgerald'] }, [ @@ -470,7 +560,14 @@ if __name__ == '__main__': # tests {{{ 'The Blood Red Indian Summer: A Berger and Mitry Mystery', 'authors': ['David Handler'], }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery') - ]) + ]), + ({ + 'identifiers': {'isbn': '9781618246509'}, + }, [ + title_test('The dragon done it', exact=True), + authors_test(['Eric Flint', 'Mike Resnick']) + ]), + ] test_identify_plugin(GoogleBooks.name, tests[:])