diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index e7ce36da74..e615c0b1b1 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -2,11 +2,14 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai # License: GPLv3 Copyright: 2011, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals - import hashlib +import os import re -import time import regex +import sys +import tempfile +import time + try: from queue import Empty, Queue except ImportError: @@ -14,7 +17,7 @@ except ImportError: from calibre import as_unicode from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata import check_isbn, authors_to_string +from calibre.ebooks.metadata import authors_to_string, check_isbn from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.sources.base import Source from calibre.utils.cleantext import clean_ascii_chars @@ -55,13 +58,7 @@ def XPath(x): return ans -def cleanup_title(title): - if ':' in title: - return title.partition(':')[0] - return re.sub(r'(.+?) 
\(.+\)', r'\1', title) - - -def to_metadata(browser, log, entry_, timeout): # {{{ +def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{ from lxml import etree # total_results = XPath('//openSearch:totalResults') @@ -94,6 +91,10 @@ def to_metadata(browser, log, entry_, timeout): # {{{ def get_extra_details(): raw = get_details(browser, details_url, timeout) + if running_a_test: + with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f: + f.write(raw) + print('Book details saved to:', f.name, file=sys.stderr) feed = etree.fromstring( xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) @@ -186,7 +187,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ class GoogleBooks(Source): name = 'Google' - version = (1, 0, 6) + version = (1, 0, 7) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Google Books') @@ -211,7 +212,7 @@ class GoogleBooks(Source): # }}} def id_from_url(self, url): # {{{ - from polyglot.urllib import urlparse, parse_qs + from polyglot.urllib import parse_qs, urlparse purl = urlparse(url) if purl.netloc == 'books.google.com': q = parse_qs(purl.query) @@ -332,6 +333,19 @@ class GoogleBooks(Source): # }}} + def postprocess_downloaded_google_metadata(self, ans, relevance=0): # {{{ + if not isinstance(ans, Metadata): + return ans + ans.source_relevance = relevance + goog = ans.identifiers['google'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, goog) + if getattr(ans, 'has_google_cover', False): + self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog) + self.clean_downloaded_metadata(ans) + return ans + # }}} + def get_all_details( # {{{ self, br, @@ -344,19 +358,10 @@ class GoogleBooks(Source): from lxml import etree for relevance, i in enumerate(entries): try: - ans = to_metadata(br, log, i, 
timeout) + ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance) if isinstance(ans, Metadata): - ans.source_relevance = relevance - goog = ans.identifiers['google'] - for isbn in getattr(ans, 'all_isbns', []): - self.cache_isbn_to_identifier(isbn, goog) - if getattr(ans, 'has_google_cover', False): - self.cache_identifier_to_cover_url( - goog, self.GOOGLE_COVER % goog - ) - self.clean_downloaded_metadata(ans) result_queue.put(ans) - except: + except Exception: log.exception( 'Failed to get metadata for identify entry:', etree.tostring(i) ) @@ -378,6 +383,9 @@ class GoogleBooks(Source): isbn = check_isbn(identifiers.get('isbn', None)) q = [] strip_punc_pat = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE) + google_ids = [] + check_tokens = set() + has_google_id = 'google' in identifiers def to_check_tokens(*tokens): for t in tokens: @@ -388,8 +396,9 @@ class GoogleBooks(Source): continue yield strip_punc_pat.sub('', t) - check_tokens = set() - if isbn is not None: + if has_google_id: + google_ids.append(identifiers['google']) + elif isbn is not None: q.append(isbn) elif title or authors: title_tokens = list(self.get_title_tokens(title)) @@ -400,22 +409,22 @@ class GoogleBooks(Source): if author_tokens: q += author_tokens check_tokens |= set(to_check_tokens(*author_tokens)) - if not q: + if not q and not google_ids: return None from calibre.ebooks.metadata.sources.update import search_engines_module se = search_engines_module() - url = se.google_format_query(q, tbm='bks') - log('Making query:', url) br = se.google_specialize_browser(se.browser()) - r = [] - root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append) - pat = re.compile(r'id=([^&]+)') - google_ids = [] - for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False): - m = pat.search(q.url) - if m is None or not q.url.startswith('https://books.google'): - continue - google_ids.append(m.group(1)) + 
if not has_google_id: + url = se.google_format_query(q, tbm='bks') + log('Making query:', url) + r = [] + root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append) + pat = re.compile(r'id=([^&]+)') + for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False): + m = pat.search(q.url) + if m is None or not q.url.startswith('https://books.google'): + continue + google_ids.append(m.group(1)) if not google_ids and isbn and (title or authors): return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout) @@ -426,7 +435,7 @@ class GoogleBooks(Source): continue seen.add(gid) try: - ans = to_metadata(br, log, gid, timeout) + ans = to_metadata(br, log, gid, timeout, self.running_a_test) if isinstance(ans, Metadata): if isbn: if isbn not in ans.all_isbns: @@ -439,23 +448,15 @@ class GoogleBooks(Source): if candidate.intersection(check_tokens) != check_tokens: log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query') continue - ans.source_relevance = relevance - goog = ans.identifiers['google'] - for isbnx in getattr(ans, 'all_isbns', []): - self.cache_isbn_to_identifier(isbnx, goog) - if getattr(ans, 'has_google_cover', False): - self.cache_identifier_to_cover_url( - goog, self.GOOGLE_COVER % goog - ) - self.clean_downloaded_metadata(ans) + ans = self.postprocess_downloaded_google_metadata(ans, relevance) result_queue.put(ans) found = True except: log.exception('Failed to get metadata for google books id:', gid) if abort.is_set(): break - if not found and isbn and (title or authors): - return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout) + if not found and isbn and (title or authors): + return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout) # }}} def identify( # {{{ @@ -468,11 +469,20 @@ class GoogleBooks(Source): identifiers={}, timeout=30 ): - if True: - return self.identify_via_web_search(log, 
result_queue, abort, title, authors, identifiers, timeout) - from lxml import etree entry = XPath('//atom:entry') + identifiers = identifiers.copy() + br = self.browser + if 'google' in identifiers: + try: + ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test) + if isinstance(ans, Metadata): + self.postprocess_downloaded_google_metadata(ans) + result_queue.put(ans) + return + except Exception: + log.exception('Failed to get metadata for Google identifier:', identifiers['google']) + del identifiers['google'] query = self.create_query( title=title, authors=authors, identifiers=identifiers @@ -480,8 +490,6 @@ class GoogleBooks(Source): if not query: log.error('Insufficient metadata to construct query') return - alternate_query = self.create_query(title=title, authors=authors, identifiers=identifiers, capitalize_isbn=True) - br = self.browser def make_query(query): log('Making query:', query) @@ -503,34 +511,9 @@ class GoogleBooks(Source): ok, entries = make_query(query) if not ok: return entries - if not entries and alternate_query != query and not abort.is_set(): - log('No results found, retrying with capitalized ISBN') - ok, entries = make_query(alternate_query) - if not ok: - return entries - - if not entries and title and not abort.is_set(): - if identifiers: - log('No results found, retrying without identifiers') - return self.identify( - log, - result_queue, - abort, - title=title, - authors=authors, - timeout=timeout - ) - ntitle = cleanup_title(title) - if ntitle and ntitle != title: - log('No results found, retrying without sub-title') - return self.identify( - log, - result_queue, - abort, - title=ntitle, - authors=authors, - timeout=timeout - ) + if not entries and not abort.is_set(): + log('No results found, doing a web search instead') + return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout) # There is no point running these queries in threads as google # throttles requests 
returning 403 Forbidden errors @@ -540,12 +523,16 @@ class GoogleBooks(Source): if __name__ == '__main__': # tests {{{ - # To run these test use: calibre-debug - # src/calibre/ebooks/metadata/sources/google.py + # To run these test use: + # calibre-debug src/calibre/ebooks/metadata/sources/google.py from calibre.ebooks.metadata.sources.test import ( - test_identify_plugin, title_test, authors_test + authors_test, test_identify_plugin, title_test ) tests = [ + ({ + 'identifiers': {'google': 's7NIrgEACAAJ'}, + }, [title_test('Ride Every Stride', exact=False)]), + ({ 'identifiers': {'isbn': '0743273567'}, 'title': 'Great Gatsby', @@ -554,16 +541,19 @@ if __name__ == '__main__': # tests {{{ title_test('The great gatsby', exact=True), authors_test(['F. Scott Fitzgerald']) ]), - ({ + + ({ 'title': 'Flatland', 'authors': ['Abbott'] }, [title_test('Flatland', exact=False)]), + ({ 'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery', 'authors': ['David Handler'], }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery') ]), + ({ 'identifiers': {'isbn': '9781618246509'}, }, [ diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py index 9c03c4a860..2a06d12cee 100644 --- a/src/calibre/ebooks/metadata/sources/search_engines.py +++ b/src/calibre/ebooks/metadata/sources/search_engines.py @@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.lock import ExclusiveFile from calibre.utils.random_ua import accept_header_for_ua -current_version = (1, 0, 17) +current_version = (1, 0, 18) minimum_calibre_version = (2, 80, 0) @@ -308,7 +308,9 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True): def google_specialize_browser(br): - br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/') + if not hasattr(br, 'google_consent_cookie_added'): + br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/') + 
br.google_consent_cookie_added = True return br