mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Google books metadata download: Only do a web search when a search via the API returns no matches
This results in fewer queries to the Google web search engine and hopefully allows people to download metadata for more books before Google starts blocking their IPs.
This commit is contained in:
parent
2346792bad
commit
a7662ea9b1
@ -2,11 +2,14 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import regex
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
try:
|
||||
from queue import Empty, Queue
|
||||
except ImportError:
|
||||
@ -14,7 +17,7 @@ except ImportError:
|
||||
|
||||
from calibre import as_unicode
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import check_isbn, authors_to_string
|
||||
from calibre.ebooks.metadata import authors_to_string, check_isbn
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.metadata.sources.base import Source
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
@ -55,13 +58,7 @@ def XPath(x):
|
||||
return ans
|
||||
|
||||
|
||||
def cleanup_title(title):
|
||||
if ':' in title:
|
||||
return title.partition(':')[0]
|
||||
return re.sub(r'(.+?) \(.+\)', r'\1', title)
|
||||
|
||||
|
||||
def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
||||
from lxml import etree
|
||||
|
||||
# total_results = XPath('//openSearch:totalResults')
|
||||
@ -94,6 +91,10 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
|
||||
def get_extra_details():
|
||||
raw = get_details(browser, details_url, timeout)
|
||||
if running_a_test:
|
||||
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
|
||||
f.write(raw)
|
||||
print('Book details saved to:', f.name, file=sys.stderr)
|
||||
feed = etree.fromstring(
|
||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||
@ -186,7 +187,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
class GoogleBooks(Source):
|
||||
|
||||
name = 'Google'
|
||||
version = (1, 0, 6)
|
||||
version = (1, 0, 7)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
description = _('Downloads metadata and covers from Google Books')
|
||||
|
||||
@ -211,7 +212,7 @@ class GoogleBooks(Source):
|
||||
# }}}
|
||||
|
||||
def id_from_url(self, url): # {{{
|
||||
from polyglot.urllib import urlparse, parse_qs
|
||||
from polyglot.urllib import parse_qs, urlparse
|
||||
purl = urlparse(url)
|
||||
if purl.netloc == 'books.google.com':
|
||||
q = parse_qs(purl.query)
|
||||
@ -332,6 +333,19 @@ class GoogleBooks(Source):
|
||||
|
||||
# }}}
|
||||
|
||||
def postprocess_downloaded_google_metadata(self, ans, relevance=0): # {{{
|
||||
if not isinstance(ans, Metadata):
|
||||
return ans
|
||||
ans.source_relevance = relevance
|
||||
goog = ans.identifiers['google']
|
||||
for isbn in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbn, goog)
|
||||
if getattr(ans, 'has_google_cover', False):
|
||||
self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
return ans
|
||||
# }}}
|
||||
|
||||
def get_all_details( # {{{
|
||||
self,
|
||||
br,
|
||||
@ -344,19 +358,10 @@ class GoogleBooks(Source):
|
||||
from lxml import etree
|
||||
for relevance, i in enumerate(entries):
|
||||
try:
|
||||
ans = to_metadata(br, log, i, timeout)
|
||||
ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)
|
||||
if isinstance(ans, Metadata):
|
||||
ans.source_relevance = relevance
|
||||
goog = ans.identifiers['google']
|
||||
for isbn in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbn, goog)
|
||||
if getattr(ans, 'has_google_cover', False):
|
||||
self.cache_identifier_to_cover_url(
|
||||
goog, self.GOOGLE_COVER % goog
|
||||
)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
result_queue.put(ans)
|
||||
except:
|
||||
except Exception:
|
||||
log.exception(
|
||||
'Failed to get metadata for identify entry:', etree.tostring(i)
|
||||
)
|
||||
@ -378,6 +383,9 @@ class GoogleBooks(Source):
|
||||
isbn = check_isbn(identifiers.get('isbn', None))
|
||||
q = []
|
||||
strip_punc_pat = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
|
||||
google_ids = []
|
||||
check_tokens = set()
|
||||
has_google_id = 'google' in identifiers
|
||||
|
||||
def to_check_tokens(*tokens):
|
||||
for t in tokens:
|
||||
@ -388,8 +396,9 @@ class GoogleBooks(Source):
|
||||
continue
|
||||
yield strip_punc_pat.sub('', t)
|
||||
|
||||
check_tokens = set()
|
||||
if isbn is not None:
|
||||
if has_google_id:
|
||||
google_ids.append(identifiers['google'])
|
||||
elif isbn is not None:
|
||||
q.append(isbn)
|
||||
elif title or authors:
|
||||
title_tokens = list(self.get_title_tokens(title))
|
||||
@ -400,22 +409,22 @@ class GoogleBooks(Source):
|
||||
if author_tokens:
|
||||
q += author_tokens
|
||||
check_tokens |= set(to_check_tokens(*author_tokens))
|
||||
if not q:
|
||||
if not q and not google_ids:
|
||||
return None
|
||||
from calibre.ebooks.metadata.sources.update import search_engines_module
|
||||
se = search_engines_module()
|
||||
url = se.google_format_query(q, tbm='bks')
|
||||
log('Making query:', url)
|
||||
br = se.google_specialize_browser(se.browser())
|
||||
r = []
|
||||
root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
|
||||
pat = re.compile(r'id=([^&]+)')
|
||||
google_ids = []
|
||||
for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
|
||||
m = pat.search(q.url)
|
||||
if m is None or not q.url.startswith('https://books.google'):
|
||||
continue
|
||||
google_ids.append(m.group(1))
|
||||
if not has_google_id:
|
||||
url = se.google_format_query(q, tbm='bks')
|
||||
log('Making query:', url)
|
||||
r = []
|
||||
root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
|
||||
pat = re.compile(r'id=([^&]+)')
|
||||
for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
|
||||
m = pat.search(q.url)
|
||||
if m is None or not q.url.startswith('https://books.google'):
|
||||
continue
|
||||
google_ids.append(m.group(1))
|
||||
|
||||
if not google_ids and isbn and (title or authors):
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
|
||||
@ -426,7 +435,7 @@ class GoogleBooks(Source):
|
||||
continue
|
||||
seen.add(gid)
|
||||
try:
|
||||
ans = to_metadata(br, log, gid, timeout)
|
||||
ans = to_metadata(br, log, gid, timeout, self.running_a_test)
|
||||
if isinstance(ans, Metadata):
|
||||
if isbn:
|
||||
if isbn not in ans.all_isbns:
|
||||
@ -439,23 +448,15 @@ class GoogleBooks(Source):
|
||||
if candidate.intersection(check_tokens) != check_tokens:
|
||||
log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
|
||||
continue
|
||||
ans.source_relevance = relevance
|
||||
goog = ans.identifiers['google']
|
||||
for isbnx in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbnx, goog)
|
||||
if getattr(ans, 'has_google_cover', False):
|
||||
self.cache_identifier_to_cover_url(
|
||||
goog, self.GOOGLE_COVER % goog
|
||||
)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
ans = self.postprocess_downloaded_google_metadata(ans, relevance)
|
||||
result_queue.put(ans)
|
||||
found = True
|
||||
except:
|
||||
log.exception('Failed to get metadata for google books id:', gid)
|
||||
if abort.is_set():
|
||||
break
|
||||
if not found and isbn and (title or authors):
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
|
||||
if not found and isbn and (title or authors):
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
|
||||
# }}}
|
||||
|
||||
def identify( # {{{
|
||||
@ -468,11 +469,20 @@ class GoogleBooks(Source):
|
||||
identifiers={},
|
||||
timeout=30
|
||||
):
|
||||
if True:
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
|
||||
|
||||
from lxml import etree
|
||||
entry = XPath('//atom:entry')
|
||||
identifiers = identifiers.copy()
|
||||
br = self.browser
|
||||
if 'google' in identifiers:
|
||||
try:
|
||||
ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)
|
||||
if isinstance(ans, Metadata):
|
||||
self.postprocess_downloaded_google_metadata(ans)
|
||||
result_queue.put(ans)
|
||||
return
|
||||
except Exception:
|
||||
self.log.exception('Failed to get metadata for Google identifier:', identifiers['google'])
|
||||
del identifiers['google']
|
||||
|
||||
query = self.create_query(
|
||||
title=title, authors=authors, identifiers=identifiers
|
||||
@ -480,8 +490,6 @@ class GoogleBooks(Source):
|
||||
if not query:
|
||||
log.error('Insufficient metadata to construct query')
|
||||
return
|
||||
alternate_query = self.create_query(title=title, authors=authors, identifiers=identifiers, capitalize_isbn=True)
|
||||
br = self.browser
|
||||
|
||||
def make_query(query):
|
||||
log('Making query:', query)
|
||||
@ -503,34 +511,9 @@ class GoogleBooks(Source):
|
||||
ok, entries = make_query(query)
|
||||
if not ok:
|
||||
return entries
|
||||
if not entries and alternate_query != query and not abort.is_set():
|
||||
log('No results found, retrying with capitalized ISBN')
|
||||
ok, entries = make_query(alternate_query)
|
||||
if not ok:
|
||||
return entries
|
||||
|
||||
if not entries and title and not abort.is_set():
|
||||
if identifiers:
|
||||
log('No results found, retrying without identifiers')
|
||||
return self.identify(
|
||||
log,
|
||||
result_queue,
|
||||
abort,
|
||||
title=title,
|
||||
authors=authors,
|
||||
timeout=timeout
|
||||
)
|
||||
ntitle = cleanup_title(title)
|
||||
if ntitle and ntitle != title:
|
||||
log('No results found, retrying without sub-title')
|
||||
return self.identify(
|
||||
log,
|
||||
result_queue,
|
||||
abort,
|
||||
title=ntitle,
|
||||
authors=authors,
|
||||
timeout=timeout
|
||||
)
|
||||
if not entries and not abort.is_set():
|
||||
log('No results found, doing a web search instead')
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
|
||||
|
||||
# There is no point running these queries in threads as google
|
||||
# throttles requests returning 403 Forbidden errors
|
||||
@ -540,12 +523,16 @@ class GoogleBooks(Source):
|
||||
|
||||
|
||||
if __name__ == '__main__': # tests {{{
|
||||
# To run these test use: calibre-debug
|
||||
# src/calibre/ebooks/metadata/sources/google.py
|
||||
# To run these test use:
|
||||
# calibre-debug src/calibre/ebooks/metadata/sources/google.py
|
||||
from calibre.ebooks.metadata.sources.test import (
|
||||
test_identify_plugin, title_test, authors_test
|
||||
authors_test, test_identify_plugin, title_test
|
||||
)
|
||||
tests = [
|
||||
({
|
||||
'identifiers': {'google': 's7NIrgEACAAJ'},
|
||||
}, [title_test('Ride Every Stride', exact=False)]),
|
||||
|
||||
({
|
||||
'identifiers': {'isbn': '0743273567'},
|
||||
'title': 'Great Gatsby',
|
||||
@ -554,16 +541,19 @@ if __name__ == '__main__': # tests {{{
|
||||
title_test('The great gatsby', exact=True),
|
||||
authors_test(['F. Scott Fitzgerald'])
|
||||
]),
|
||||
({
|
||||
|
||||
({
|
||||
'title': 'Flatland',
|
||||
'authors': ['Abbott']
|
||||
}, [title_test('Flatland', exact=False)]),
|
||||
|
||||
({
|
||||
'title':
|
||||
'The Blood Red Indian Summer: A Berger and Mitry Mystery',
|
||||
'authors': ['David Handler'],
|
||||
}, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
|
||||
]),
|
||||
|
||||
({
|
||||
'identifiers': {'isbn': '9781618246509'},
|
||||
}, [
|
||||
|
@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.lock import ExclusiveFile
|
||||
from calibre.utils.random_ua import accept_header_for_ua
|
||||
|
||||
current_version = (1, 0, 17)
|
||||
current_version = (1, 0, 18)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
|
||||
|
||||
@ -308,7 +308,9 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
|
||||
|
||||
|
||||
def google_specialize_browser(br):
|
||||
br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')
|
||||
if not hasattr(br, 'google_consent_cookie_added'):
|
||||
br.set_simple_cookie('CONSENT', 'YES+', '.google.com', path='/')
|
||||
br.google_consent_cookie_added = True
|
||||
return br
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user