Google books metadata download: Only do a web search when a search via the API returns no matches

This results in fewer queries to the Google web search engine and
hopefully allows people to download metadata for more books before
Google starts blocking their IPs.
Kovid Goyal 2022-07-31 09:04:53 +05:30
parent 2346792bad
commit a7662ea9b1
GPG Key ID: 06BC317B515ACE7C
2 changed files with 80 additions and 88 deletions

src/calibre/ebooks/metadata/sources/google.py

@@ -2,11 +2,14 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 # License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import hashlib
+import os
 import re
-import time
 import regex
+import sys
+import tempfile
+import time
 try:
     from queue import Empty, Queue
 except ImportError:
@@ -14,7 +17,7 @@ except ImportError:
 from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.metadata import check_isbn, authors_to_string
+from calibre.ebooks.metadata import authors_to_string, check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
@@ -55,13 +58,7 @@ def XPath(x):
     return ans
 
-def cleanup_title(title):
-    if ':' in title:
-        return title.partition(':')[0]
-    return re.sub(r'(.+?) \(.+\)', r'\1', title)
-
-def to_metadata(browser, log, entry_, timeout):  # {{{
+def to_metadata(browser, log, entry_, timeout, running_a_test=False):  # {{{
     from lxml import etree
 
     # total_results = XPath('//openSearch:totalResults')
@@ -94,6 +91,10 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
     def get_extra_details():
         raw = get_details(browser, details_url, timeout)
+        if running_a_test:
+            with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
+                f.write(raw)
+            print('Book details saved to:', f.name, file=sys.stderr)
         feed = etree.fromstring(
             xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
             parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
@@ -186,7 +187,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
 class GoogleBooks(Source):
 
     name = 'Google'
-    version = (1, 0, 6)
+    version = (1, 0, 7)
     minimum_calibre_version = (2, 80, 0)
     description = _('Downloads metadata and covers from Google Books')
@@ -211,7 +212,7 @@ class GoogleBooks(Source):
     # }}}
 
     def id_from_url(self, url):  # {{{
-        from polyglot.urllib import urlparse, parse_qs
+        from polyglot.urllib import parse_qs, urlparse
         purl = urlparse(url)
         if purl.netloc == 'books.google.com':
             q = parse_qs(purl.query)
@@ -332,6 +333,19 @@ class GoogleBooks(Source):
     # }}}
 
+    def postprocess_downloaded_google_metadata(self, ans, relevance=0):  # {{{
+        if not isinstance(ans, Metadata):
+            return ans
+        ans.source_relevance = relevance
+        goog = ans.identifiers['google']
+        for isbn in getattr(ans, 'all_isbns', []):
+            self.cache_isbn_to_identifier(isbn, goog)
+        if getattr(ans, 'has_google_cover', False):
+            self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)
+        self.clean_downloaded_metadata(ans)
+        return ans
+    # }}}
+
     def get_all_details(  # {{{
         self,
         br,
@@ -344,19 +358,10 @@
         from lxml import etree
         for relevance, i in enumerate(entries):
             try:
-                ans = to_metadata(br, log, i, timeout)
+                ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)
                 if isinstance(ans, Metadata):
-                    ans.source_relevance = relevance
-                    goog = ans.identifiers['google']
-                    for isbn in getattr(ans, 'all_isbns', []):
-                        self.cache_isbn_to_identifier(isbn, goog)
-                    if getattr(ans, 'has_google_cover', False):
-                        self.cache_identifier_to_cover_url(
-                            goog, self.GOOGLE_COVER % goog
-                        )
-                    self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
-            except:
+            except Exception:
                 log.exception(
                     'Failed to get metadata for identify entry:', etree.tostring(i)
                 )
@@ -378,6 +383,9 @@ class GoogleBooks(Source):
         isbn = check_isbn(identifiers.get('isbn', None))
         q = []
         strip_punc_pat = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
+        google_ids = []
+        check_tokens = set()
+        has_google_id = 'google' in identifiers
 
        def to_check_tokens(*tokens):
            for t in tokens:
@@ -388,8 +396,9 @@
                 continue
             yield strip_punc_pat.sub('', t)
 
-        check_tokens = set()
-        if isbn is not None:
+        if has_google_id:
+            google_ids.append(identifiers['google'])
+        elif isbn is not None:
             q.append(isbn)
         elif title or authors:
             title_tokens = list(self.get_title_tokens(title))
@@ -400,17 +409,17 @@
             if author_tokens:
                 q += author_tokens
                 check_tokens |= set(to_check_tokens(*author_tokens))
-        if not q:
+        if not q and not google_ids:
             return None
         from calibre.ebooks.metadata.sources.update import search_engines_module
         se = search_engines_module()
-        url = se.google_format_query(q, tbm='bks')
-        log('Making query:', url)
-        br = se.google_specialize_browser(se.browser())
-        r = []
-        root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
-        pat = re.compile(r'id=([^&]+)')
-        google_ids = []
-        for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
+        br = se.google_specialize_browser(se.browser())
+        if not has_google_id:
+            url = se.google_format_query(q, tbm='bks')
+            log('Making query:', url)
+            r = []
+            root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
+            pat = re.compile(r'id=([^&]+)')
+            for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
@@ -426,7 +435,7 @@ class GoogleBooks(Source):
                 continue
             seen.add(gid)
             try:
-                ans = to_metadata(br, log, gid, timeout)
+                ans = to_metadata(br, log, gid, timeout, self.running_a_test)
                 if isinstance(ans, Metadata):
                     if isbn:
                         if isbn not in ans.all_isbns:
@@ -439,15 +448,7 @@
                         if candidate.intersection(check_tokens) != check_tokens:
                             log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
                             continue
-                    ans.source_relevance = relevance
-                    goog = ans.identifiers['google']
-                    for isbnx in getattr(ans, 'all_isbns', []):
-                        self.cache_isbn_to_identifier(isbnx, goog)
-                    if getattr(ans, 'has_google_cover', False):
-                        self.cache_identifier_to_cover_url(
-                            goog, self.GOOGLE_COVER % goog
-                        )
-                    self.clean_downloaded_metadata(ans)
+                    ans = self.postprocess_downloaded_google_metadata(ans, relevance)
                     result_queue.put(ans)
                     found = True
             except:
@@ -468,11 +469,20 @@
         identifiers={},
         timeout=30
     ):
-        if True:
-            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
         from lxml import etree
         entry = XPath('//atom:entry')
+        identifiers = identifiers.copy()
+        br = self.browser
+        if 'google' in identifiers:
+            try:
+                ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)
+                if isinstance(ans, Metadata):
+                    self.postprocess_downloaded_google_metadata(ans)
+                    result_queue.put(ans)
+                    return
+            except Exception:
+                self.log.exception('Failed to get metadata for Google identifier:', identifiers['google'])
+            del identifiers['google']
 
         query = self.create_query(
             title=title, authors=authors, identifiers=identifiers
@@ -480,8 +490,6 @@
         if not query:
             log.error('Insufficient metadata to construct query')
             return
-        alternate_query = self.create_query(title=title, authors=authors, identifiers=identifiers, capitalize_isbn=True)
-        br = self.browser
 
         def make_query(query):
             log('Making query:', query)
@@ -503,34 +511,9 @@
         ok, entries = make_query(query)
         if not ok:
             return entries
-        if not entries and alternate_query != query and not abort.is_set():
-            log('No results found, retrying with capitalized ISBN')
-            ok, entries = make_query(alternate_query)
-            if not ok:
-                return entries
-        if not entries and title and not abort.is_set():
-            if identifiers:
-                log('No results found, retrying without identifiers')
-                return self.identify(
-                    log,
-                    result_queue,
-                    abort,
-                    title=title,
-                    authors=authors,
-                    timeout=timeout
-                )
-            ntitle = cleanup_title(title)
-            if ntitle and ntitle != title:
-                log('No results found, retrying without sub-title')
-                return self.identify(
-                    log,
-                    result_queue,
-                    abort,
-                    title=ntitle,
-                    authors=authors,
-                    timeout=timeout
-                )
+        if not entries and not abort.is_set():
+            log('No results found, doing a web search instead')
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
 
         # There is no point running these queries in threads as google
         # throttles requests returning 403 Forbidden errors
@@ -540,12 +523,16 @@
 
 if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug
-    # src/calibre/ebooks/metadata/sources/google.py
+    # To run these test use:
+    # calibre-debug src/calibre/ebooks/metadata/sources/google.py
     from calibre.ebooks.metadata.sources.test import (
-        test_identify_plugin, title_test, authors_test
+        authors_test, test_identify_plugin, title_test
     )
     tests = [
+        ({
+            'identifiers': {'google': 's7NIrgEACAAJ'},
+        }, [title_test('Ride Every Stride', exact=False)]),
         ({
             'identifiers': {'isbn': '0743273567'},
             'title': 'Great Gatsby',
@@ -554,16 +541,19 @@
             title_test('The great gatsby', exact=True),
             authors_test(['F. Scott Fitzgerald'])
         ]),
         ({
             'title': 'Flatland',
             'authors': ['Abbott']
         }, [title_test('Flatland', exact=False)]),
         ({
             'title':
             'The Blood Red Indian Summer: A Berger and Mitry Mystery',
             'authors': ['David Handler'],
         }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
         ]),
         ({
             'identifiers': {'isbn': '9781618246509'},
         }, [

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 0, 17)
+current_version = (1, 0, 18)
 minimum_calibre_version = (2, 80, 0)
@@ -308,7 +308,9 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
 
 def google_specialize_browser(br):
-    br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')
+    if not hasattr(br, 'google_consent_cookie_added'):
+        br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')
+        br.google_consent_cookie_added = True
     return br