mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Metadata download from Google: Switch to making the queries via a Google web search as this finds some books that searching via the API does not. Fixes #1982502 [Google metadata download suboptimal search](https://bugs.launchpad.net/calibre/+bug/1982502)
This commit is contained in:
parent
25fb58adfe
commit
d19150f852
@ -13,7 +13,7 @@ except ImportError:
|
||||
|
||||
from calibre import as_unicode
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import check_isbn
|
||||
from calibre.ebooks.metadata import check_isbn, authors_to_string
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.metadata.sources.base import Source
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
@ -91,30 +91,39 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
log.exception('Programming error:')
|
||||
return None
|
||||
|
||||
id_url = entry_id(entry_)[0].text
|
||||
google_id = id_url.split('/')[-1]
|
||||
details_url = url(entry_)[0]
|
||||
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||
if not authors:
|
||||
authors = [_('Unknown')]
|
||||
if not id_url or not title:
|
||||
# Silently discard this entry
|
||||
return None
|
||||
|
||||
mi = Metadata(title_, authors)
|
||||
mi.identifiers = {'google': google_id}
|
||||
try:
|
||||
def get_extra_details():
|
||||
raw = get_details(browser, details_url, timeout)
|
||||
feed = etree.fromstring(
|
||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||
)
|
||||
extra = entry(feed)[0]
|
||||
except:
|
||||
log.exception('Failed to get additional details for', mi.title)
|
||||
return mi
|
||||
return entry(feed)[0]
|
||||
|
||||
if isinstance(entry_, str):
|
||||
google_id = entry_
|
||||
details_url = 'https://www.google.com/books/feeds/volumes/' + google_id
|
||||
extra = get_extra_details()
|
||||
title_ = ': '.join([x.text for x in title(extra)]).strip()
|
||||
authors = [x.text.strip() for x in creator(extra) if x.text]
|
||||
else:
|
||||
id_url = entry_id(entry_)[0].text
|
||||
google_id = id_url.split('/')[-1]
|
||||
details_url = url(entry_)[0]
|
||||
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||
if not id_url or not title:
|
||||
# Silently discard this entry
|
||||
return None
|
||||
extra = None
|
||||
|
||||
if not authors:
|
||||
authors = [_('Unknown')]
|
||||
if not title:
|
||||
return None
|
||||
if extra is None:
|
||||
extra = get_extra_details()
|
||||
mi = Metadata(title_, authors)
|
||||
mi.identifiers = {'google': google_id}
|
||||
mi.comments = get_text(extra, description)
|
||||
lang = canonicalize_lang(get_text(extra, language))
|
||||
if lang:
|
||||
@ -176,7 +185,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
class GoogleBooks(Source):
|
||||
|
||||
name = 'Google'
|
||||
version = (1, 0, 3)
|
||||
version = (1, 0, 4)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
description = _('Downloads metadata and covers from Google Books')
|
||||
|
||||
@ -355,6 +364,97 @@ class GoogleBooks(Source):
|
||||
|
||||
# }}}
|
||||
|
||||
def identify_via_web_search( # {{{
|
||||
self,
|
||||
log,
|
||||
result_queue,
|
||||
abort,
|
||||
title=None,
|
||||
authors=None,
|
||||
identifiers={},
|
||||
timeout=30
|
||||
):
|
||||
isbn = check_isbn(identifiers.get('isbn', None))
|
||||
q = []
|
||||
|
||||
def to_check_tokens(*tokens):
|
||||
for t in tokens:
|
||||
if len(t) < 3:
|
||||
continue
|
||||
t = t.lower()
|
||||
if t in ('and', 'not', 'the'):
|
||||
continue
|
||||
yield t.strip(':')
|
||||
|
||||
check_tokens = set()
|
||||
if isbn is not None:
|
||||
q.append(isbn)
|
||||
elif title or authors:
|
||||
title_tokens = list(self.get_title_tokens(title))
|
||||
if title_tokens:
|
||||
q += title_tokens
|
||||
check_tokens |= set(to_check_tokens(*title_tokens))
|
||||
author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
|
||||
if author_tokens:
|
||||
q += author_tokens
|
||||
check_tokens |= set(to_check_tokens(*author_tokens))
|
||||
if not q:
|
||||
return None
|
||||
from calibre.ebooks.metadata.sources.update import search_engines_module
|
||||
se = search_engines_module()
|
||||
url = se.google_format_query(q, tbm='bks')
|
||||
log('Making query:', url)
|
||||
br = se.google_specialize_browser(se.browser())
|
||||
r = []
|
||||
root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
|
||||
pat = re.compile(r'id=([^&]+)')
|
||||
google_ids = []
|
||||
for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
|
||||
m = pat.search(q.url)
|
||||
if m is None:
|
||||
continue
|
||||
google_ids.append(m.group(1))
|
||||
|
||||
if not google_ids and isbn and (title or authors):
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
|
||||
found = False
|
||||
seen = set()
|
||||
for relevance, gid in enumerate(google_ids):
|
||||
if gid in seen:
|
||||
continue
|
||||
seen.add(gid)
|
||||
try:
|
||||
ans = to_metadata(br, log, gid, timeout)
|
||||
if isinstance(ans, Metadata):
|
||||
if isbn:
|
||||
if isbn not in ans.all_isbns:
|
||||
log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,
|
||||
'not in', ' '.join(ans.all_isbns))
|
||||
continue
|
||||
elif check_tokens:
|
||||
candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))
|
||||
candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))
|
||||
if candidate.intersection(check_tokens) != check_tokens:
|
||||
log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
|
||||
continue
|
||||
ans.source_relevance = relevance
|
||||
goog = ans.identifiers['google']
|
||||
for isbn in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbn, goog)
|
||||
if getattr(ans, 'has_google_cover', False):
|
||||
self.cache_identifier_to_cover_url(
|
||||
goog, self.GOOGLE_COVER % goog
|
||||
)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
result_queue.put(ans)
|
||||
found = True
|
||||
except:
|
||||
log.exception('Failed to get metadata for google books id:', gid)
|
||||
if abort.is_set():
|
||||
break
|
||||
if not found and isbn and (title or authors):
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
|
||||
|
||||
def identify( # {{{
|
||||
self,
|
||||
log,
|
||||
@ -365,6 +465,9 @@ class GoogleBooks(Source):
|
||||
identifiers={},
|
||||
timeout=30
|
||||
):
|
||||
if True:
|
||||
return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
|
||||
|
||||
from lxml import etree
|
||||
entry = XPath('//atom:entry')
|
||||
|
||||
@ -441,20 +544,7 @@ if __name__ == '__main__': # tests {{{
|
||||
)
|
||||
tests = [
|
||||
({
|
||||
'identifiers': {
|
||||
'isbn': '978-0-7869-5437-7' # needs capitalized ISBN to find results
|
||||
},
|
||||
'title': 'Dragons of Autumn Twilight',
|
||||
'authors': ['Margaret Weis', 'Tracy Hickman']
|
||||
}, [
|
||||
title_test('The great gatsby', exact=True),
|
||||
authors_test(['F. Scott Fitzgerald'])
|
||||
]),
|
||||
|
||||
({
|
||||
'identifiers': {
|
||||
'isbn': '0743273567'
|
||||
},
|
||||
'identifiers': {'isbn': '0743273567'},
|
||||
'title': 'Great Gatsby',
|
||||
'authors': ['Fitzgerald']
|
||||
}, [
|
||||
@ -470,7 +560,14 @@ if __name__ == '__main__': # tests {{{
|
||||
'The Blood Red Indian Summer: A Berger and Mitry Mystery',
|
||||
'authors': ['David Handler'],
|
||||
}, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
|
||||
])
|
||||
]),
|
||||
({
|
||||
'identifiers': {'isbn': '9781618246509'},
|
||||
}, [
|
||||
title_test('The dragon done it', exact=True),
|
||||
authors_test(['Eric Flint', 'Mike Resnick'])
|
||||
]),
|
||||
|
||||
]
|
||||
test_identify_plugin(GoogleBooks.name, tests[:])
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user