mirror of https://github.com/kovidgoyal/calibre.git
Metadata download from Google: Switch to making the queries via a Google web search as this finds some books that searching via the API does not. Fixes #1982502 [Google metadata download suboptimal search](https://bugs.launchpad.net/calibre/+bug/1982502)
This commit is contained in:
parent 25fb58adfe
commit d19150f852
src/calibre/ebooks/metadata/sources/google.py

@@ -13,7 +13,7 @@ except ImportError:
 from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata import check_isbn, authors_to_string
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.utils.localization import canonicalize_lang
@@ -91,30 +91,39 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
         log.exception('Programming error:')
         return None
 
-    id_url = entry_id(entry_)[0].text
-    google_id = id_url.split('/')[-1]
-    details_url = url(entry_)[0]
-    title_ = ': '.join([x.text for x in title(entry_)]).strip()
-    authors = [x.text.strip() for x in creator(entry_) if x.text]
-    if not authors:
-        authors = [_('Unknown')]
-    if not id_url or not title:
-        # Silently discard this entry
-        return None
-
-    mi = Metadata(title_, authors)
-    mi.identifiers = {'google': google_id}
-    try:
+    def get_extra_details():
         raw = get_details(browser, details_url, timeout)
         feed = etree.fromstring(
             xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
             parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
         )
-        extra = entry(feed)[0]
-    except:
-        log.exception('Failed to get additional details for', mi.title)
-        return mi
+        return entry(feed)[0]
 
+    if isinstance(entry_, str):
+        google_id = entry_
+        details_url = 'https://www.google.com/books/feeds/volumes/' + google_id
+        extra = get_extra_details()
+        title_ = ': '.join([x.text for x in title(extra)]).strip()
+        authors = [x.text.strip() for x in creator(extra) if x.text]
+    else:
+        id_url = entry_id(entry_)[0].text
+        google_id = id_url.split('/')[-1]
+        details_url = url(entry_)[0]
+        title_ = ': '.join([x.text for x in title(entry_)]).strip()
+        authors = [x.text.strip() for x in creator(entry_) if x.text]
+        if not id_url or not title:
+            # Silently discard this entry
+            return None
+        extra = None
+
+    if not authors:
+        authors = [_('Unknown')]
+    if not title:
+        return None
+    if extra is None:
+        extra = get_extra_details()
+    mi = Metadata(title_, authors)
+    mi.identifiers = {'google': google_id}
     mi.comments = get_text(extra, description)
     lang = canonicalize_lang(get_text(extra, language))
     if lang:
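Note on the hunk above: to_metadata() now accepts either an lxml Atom <entry> element from the API feed or a bare Google Books volume id string coming from a web-search hit; in the string case the details feed is fetched up front, otherwise it is fetched lazily via get_extra_details(). A minimal standalone sketch of that dispatch (the volume id below is illustrative, not from the commit):

def resolve_details_url(entry_or_id):
    # Mirrors the new isinstance(entry_, str) branch in to_metadata():
    # a plain string is treated as a bare Google Books volume id.
    if isinstance(entry_or_id, str):
        return 'https://www.google.com/books/feeds/volumes/' + entry_or_id
    # Otherwise it would be an Atom <entry> element, whose details URL
    # is read from the feed itself (url(entry_)[0] in the real code).
    raise TypeError('this sketch only handles the volume-id case')

print(resolve_details_url('s1gVAAAAYAAJ'))
# -> https://www.google.com/books/feeds/volumes/s1gVAAAAYAAJ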
@@ -176,7 +185,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
 class GoogleBooks(Source):
 
     name = 'Google'
-    version = (1, 0, 3)
+    version = (1, 0, 4)
     minimum_calibre_version = (2, 80, 0)
     description = _('Downloads metadata and covers from Google Books')
 
@@ -355,6 +364,97 @@ class GoogleBooks(Source):
 
     # }}}
 
+    def identify_via_web_search(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30
+    ):
+        isbn = check_isbn(identifiers.get('isbn', None))
+        q = []
+
+        def to_check_tokens(*tokens):
+            for t in tokens:
+                if len(t) < 3:
+                    continue
+                t = t.lower()
+                if t in ('and', 'not', 'the'):
+                    continue
+                yield t.strip(':')
+
+        check_tokens = set()
+        if isbn is not None:
+            q.append(isbn)
+        elif title or authors:
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += title_tokens
+                check_tokens |= set(to_check_tokens(*title_tokens))
+            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
+            if author_tokens:
+                q += author_tokens
+                check_tokens |= set(to_check_tokens(*author_tokens))
+        if not q:
+            return None
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        se = search_engines_module()
+        url = se.google_format_query(q, tbm='bks')
+        log('Making query:', url)
+        br = se.google_specialize_browser(se.browser())
+        r = []
+        root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)
+        pat = re.compile(r'id=([^&]+)')
+        google_ids = []
+        for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):
+            m = pat.search(q.url)
+            if m is None:
+                continue
+            google_ids.append(m.group(1))
+
+        if not google_ids and isbn and (title or authors):
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+        found = False
+        seen = set()
+        for relevance, gid in enumerate(google_ids):
+            if gid in seen:
+                continue
+            seen.add(gid)
+            try:
+                ans = to_metadata(br, log, gid, timeout)
+                if isinstance(ans, Metadata):
+                    if isbn:
+                        if isbn not in ans.all_isbns:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,
+                                'not in', ' '.join(ans.all_isbns))
+                            continue
+                    elif check_tokens:
+                        candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))
+                        candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))
+                        if candidate.intersection(check_tokens) != check_tokens:
+                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')
+                            continue
+                    ans.source_relevance = relevance
+                    goog = ans.identifiers['google']
+                    for isbn in getattr(ans, 'all_isbns', []):
+                        self.cache_isbn_to_identifier(isbn, goog)
+                    if getattr(ans, 'has_google_cover', False):
+                        self.cache_identifier_to_cover_url(
+                            goog, self.GOOGLE_COVER % goog
+                        )
+                    self.clean_downloaded_metadata(ans)
+                    result_queue.put(ans)
+                    found = True
+            except:
+                log.exception('Failed to get metadata for google books id:', gid)
+            if abort.is_set():
+                break
+        if not found and isbn and (title or authors):
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)
+
     def identify(  # {{{
         self,
         log,
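Two pieces of the new method are easy to sanity-check in isolation: the to_check_tokens() filter that prunes the query tokens used for post-search verification, and the id=([^&]+) pattern that pulls the volume id out of each result URL's query string. The sample URL below is illustrative, not from the commit:

import re

def to_check_tokens(*tokens):
    # Same filter as in identify_via_web_search() above: drop tokens
    # shorter than three characters and a few stop words, lower-case,
    # and strip trailing colons.
    for t in tokens:
        if len(t) < 3:
            continue
        t = t.lower()
        if t in ('and', 'not', 'the'):
            continue
        yield t.strip(':')

print(list(to_check_tokens('The', 'Dragon', 'Done', 'It:')))
# -> ['dragon', 'done', 'it']

# The volume-id extraction applied to each web-search result URL.
pat = re.compile(r'id=([^&]+)')
m = pat.search('https://books.google.com/books?id=s1gVAAAAYAAJ&printsec=frontcover')
print(m.group(1))  # -> s1gVAAAAYAAJ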
@@ -365,6 +465,9 @@ class GoogleBooks(Source):
         identifiers={},
         timeout=30
     ):
+        if True:
+            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)
+
         from lxml import etree
         entry = XPath('//atom:entry')
 
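A note on the change above: the if True: guard makes identify() delegate unconditionally to the new web-search implementation while leaving the old API-based body in place (now unreachable) below it, so the switch can be reverted by deleting two lines. The effective behaviour is just this (a sketch, not the committed code):

def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    # Everything below the early return in the committed method is the
    # legacy API-based lookup, kept in the file but never reached.
    return self.identify_via_web_search(
        log, result_queue, abort, title, authors, identifiers, timeout)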
@@ -441,20 +544,7 @@ if __name__ == '__main__':  # tests {{{
     )
     tests = [
         ({
-            'identifiers': {
-                'isbn': '978-0-7869-5437-7'  # needs capitalized ISBN to find results
-            },
-            'title': 'Dragons of Autumn Twilight',
-            'authors': ['Margaret Weis', 'Tracy Hickman']
-        }, [
-            title_test('The great gatsby', exact=True),
-            authors_test(['F. Scott Fitzgerald'])
-        ]),
-
-        ({
-            'identifiers': {
-                'isbn': '0743273567'
-            },
+            'identifiers': {'isbn': '0743273567'},
             'title': 'Great Gatsby',
             'authors': ['Fitzgerald']
         }, [
@@ -470,7 +560,14 @@ if __name__ == '__main__':  # tests {{{
             'The Blood Red Indian Summer: A Berger and Mitry Mystery',
             'authors': ['David Handler'],
         }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')
-        ])
+        ]),
+        ({
+            'identifiers': {'isbn': '9781618246509'},
+        }, [
+            title_test('The dragon done it', exact=True),
+            authors_test(['Eric Flint', 'Mike Resnick'])
+        ]),
+
     ]
     test_identify_plugin(GoogleBooks.name, tests[:])
 
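The new ISBN-only test exercises the q.append(isbn) branch of identify_via_web_search(), where the whole web query is the validated ISBN. A sketch of that branch for the new test's input, assuming a calibre environment for the check_isbn import:

from calibre.ebooks.metadata import check_isbn

identifiers = {'isbn': '9781618246509'}  # input of the new test case

isbn = check_isbn(identifiers.get('isbn', None))
q = []
if isbn is not None:
    # ISBN-only search: the query list is just the ISBN itself.
    q.append(isbn)
print(q)  # expected: ['9781618246509']

Per calibre's usual convention for metadata source plugins, the whole test list runs by executing this source file under calibre-debug.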