mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Google metadata download: Fix metadata not being found when the title of the book includes a sub-title
This commit is contained in:
parent
329cfcd61c
commit
c31ff8f30e
@ -5,7 +5,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
from functools import partial
|
|
||||||
from Queue import Empty, Queue
|
from Queue import Empty, Queue
|
||||||
|
|
||||||
from calibre import as_unicode
|
from calibre import as_unicode
|
||||||
@ -41,9 +40,19 @@ def get_details(browser, url, timeout): # {{{
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
|
xpath_cache = {}
|
||||||
|
|
||||||
|
|
||||||
|
def XPath(x):
    """Return the compiled XPath object for the expression ``x``.

    Compiled expressions are stored in the module-level ``xpath_cache``
    dict keyed by the expression string, so each distinct expression is
    compiled with ``NAMESPACES`` only once.
    """
    compiled = xpath_cache.get(x)
    if compiled is not None:
        return compiled
    # lxml is imported only on a cache miss, when something must be compiled
    from lxml import etree
    compiled = etree.XPath(x, namespaces=NAMESPACES)
    xpath_cache[x] = compiled
    return compiled
|
||||||
|
|
||||||
|
|
||||||
def to_metadata(browser, log, entry_, timeout): # {{{
|
def to_metadata(browser, log, entry_, timeout): # {{{
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
|
||||||
|
|
||||||
# total_results = XPath('//openSearch:totalResults')
|
# total_results = XPath('//openSearch:totalResults')
|
||||||
# start_index = XPath('//openSearch:startIndex')
|
# start_index = XPath('//openSearch:startIndex')
|
||||||
@ -58,7 +67,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
|||||||
subject = XPath('descendant::dc:subject')
|
subject = XPath('descendant::dc:subject')
|
||||||
description = XPath('descendant::dc:description')
|
description = XPath('descendant::dc:description')
|
||||||
language = XPath('descendant::dc:language')
|
language = XPath('descendant::dc:language')
|
||||||
rating = XPath('descendant::gd:rating[@average]')
|
|
||||||
# print(etree.tostring(entry_, pretty_print=True))
|
# print(etree.tostring(entry_, pretty_print=True))
|
||||||
|
|
||||||
def get_text(extra, x):
|
def get_text(extra, x):
|
||||||
@ -138,15 +146,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
|||||||
except:
|
except:
|
||||||
log.error('Failed to parse pubdate %r' % pubdate)
|
log.error('Failed to parse pubdate %r' % pubdate)
|
||||||
|
|
||||||
# Ratings
|
|
||||||
for x in rating(extra):
|
|
||||||
try:
|
|
||||||
mi.rating = float(x.get('average'))
|
|
||||||
if mi.rating > 5:
|
|
||||||
mi.rating /= 2
|
|
||||||
except:
|
|
||||||
log.exception('Failed to parse rating')
|
|
||||||
|
|
||||||
# Cover
|
# Cover
|
||||||
mi.has_google_cover = None
|
mi.has_google_cover = None
|
||||||
for x in extra.xpath(
|
for x in extra.xpath(
|
||||||
@ -178,7 +177,8 @@ class GoogleBooks(Source):
|
|||||||
|
|
||||||
GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
|
GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
|
||||||
|
|
||||||
DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
|
DUMMY_IMAGE_MD5 = frozenset(
|
||||||
|
{'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
|
||||||
|
|
||||||
def get_book_url(self, identifiers): # {{{
|
def get_book_url(self, identifiers): # {{{
|
||||||
goog = identifiers.get('google', None)
|
goog = identifiers.get('google', None)
|
||||||
@ -202,7 +202,8 @@ class GoogleBooks(Source):
|
|||||||
title_tokens = list(self.get_title_tokens(title))
|
title_tokens = list(self.get_title_tokens(title))
|
||||||
if title_tokens:
|
if title_tokens:
|
||||||
q += build_term('title', title_tokens)
|
q += build_term('title', title_tokens)
|
||||||
author_tokens = self.get_author_tokens(authors, only_first_author=True)
|
author_tokens = self.get_author_tokens(
|
||||||
|
authors, only_first_author=True)
|
||||||
if author_tokens:
|
if author_tokens:
|
||||||
q += ('+' if q else '') + build_term('author', author_tokens)
|
q += ('+' if q else '') + build_term('author', author_tokens)
|
||||||
|
|
||||||
@ -322,7 +323,8 @@ class GoogleBooks(Source):
|
|||||||
result_queue.put(ans)
|
result_queue.put(ans)
|
||||||
except:
|
except:
|
||||||
log.exception(
|
log.exception(
|
||||||
'Failed to get metadata for identify entry:', etree.tostring(i)
|
'Failed to get metadata for identify entry:', etree.tostring(
|
||||||
|
i)
|
||||||
)
|
)
|
||||||
if abort.is_set():
|
if abort.is_set():
|
||||||
break
|
break
|
||||||
@ -340,7 +342,6 @@ class GoogleBooks(Source):
|
|||||||
timeout=30
|
timeout=30
|
||||||
):
|
):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
|
||||||
entry = XPath('//atom:entry')
|
entry = XPath('//atom:entry')
|
||||||
|
|
||||||
query = self.create_query(
|
query = self.create_query(
|
||||||
@ -350,7 +351,7 @@ class GoogleBooks(Source):
|
|||||||
log.error('Insufficient metadata to construct query')
|
log.error('Insufficient metadata to construct query')
|
||||||
return
|
return
|
||||||
br = self.browser
|
br = self.browser
|
||||||
self.log('Making query:', query)
|
log('Making query:', query)
|
||||||
try:
|
try:
|
||||||
raw = br.open_novisit(query, timeout=timeout).read()
|
raw = br.open_novisit(query, timeout=timeout).read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -360,7 +361,8 @@ class GoogleBooks(Source):
|
|||||||
try:
|
try:
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
parser = etree.XMLParser(recover=True, no_network=True)
|
||||||
feed = etree.fromstring(
|
feed = etree.fromstring(
|
||||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
xml_to_unicode(clean_ascii_chars(
|
||||||
|
raw), strip_encoding_pats=True)[0],
|
||||||
parser=parser
|
parser=parser
|
||||||
)
|
)
|
||||||
entries = entry(feed)
|
entries = entry(feed)
|
||||||
@ -368,16 +370,29 @@ class GoogleBooks(Source):
|
|||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
if not entries and identifiers and title and authors and \
|
if not entries and title and not abort.is_set():
|
||||||
not abort.is_set():
|
if identifiers:
|
||||||
return self.identify(
|
log('No results found, retrying without identifiers')
|
||||||
log,
|
return self.identify(
|
||||||
result_queue,
|
log,
|
||||||
abort,
|
result_queue,
|
||||||
title=title,
|
abort,
|
||||||
authors=authors,
|
title=title,
|
||||||
timeout=timeout
|
authors=authors,
|
||||||
)
|
timeout=timeout
|
||||||
|
)
|
||||||
|
if ':' in title:
|
||||||
|
title = title.partition(':')[0]
|
||||||
|
if title:
|
||||||
|
log('No results found, retrying without sub-title')
|
||||||
|
return self.identify(
|
||||||
|
log,
|
||||||
|
result_queue,
|
||||||
|
abort,
|
||||||
|
title=title,
|
||||||
|
authors=authors,
|
||||||
|
timeout=timeout
|
||||||
|
)
|
||||||
|
|
||||||
# There is no point running these queries in threads as google
|
# There is no point running these queries in threads as google
|
||||||
# throttles requests returning 403 Forbidden errors
|
# throttles requests returning 403 Forbidden errors
|
||||||
@ -387,27 +402,36 @@ class GoogleBooks(Source):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these tests use: calibre-debug src/calibre/ebooks/metadata/sources/google.py
|
# To run these tests use: calibre-debug
|
||||||
|
# src/calibre/ebooks/metadata/sources/google.py
|
||||||
from calibre.ebooks.metadata.sources.test import (
|
from calibre.ebooks.metadata.sources.test import (
|
||||||
test_identify_plugin, title_test, authors_test
|
test_identify_plugin, title_test, authors_test
|
||||||
)
|
)
|
||||||
test_identify_plugin(
|
tests = [
|
||||||
GoogleBooks.name, [
|
({
|
||||||
({
|
'identifiers': {
|
||||||
'identifiers': {
|
'isbn': '0743273567'
|
||||||
'isbn': '0743273567'
|
},
|
||||||
},
|
'title': 'Great Gatsby',
|
||||||
'title': 'Great Gatsby',
|
'authors': ['Fitzgerald']
|
||||||
'authors': ['Fitzgerald']
|
}, [
|
||||||
}, [
|
title_test('The great gatsby', exact=True),
|
||||||
title_test('The great gatsby', exact=True),
|
authors_test(['F. Scott Fitzgerald'])
|
||||||
authors_test(['F. Scott Fitzgerald'])
|
|
||||||
]),
|
|
||||||
({
|
|
||||||
'title': 'Flatland',
|
|
||||||
'authors': ['Abbott']
|
|
||||||
}, [title_test('Flatland', exact=False)]),
|
|
||||||
]
|
]
|
||||||
)
|
),
|
||||||
|
|
||||||
|
({
|
||||||
|
'title': 'Flatland',
|
||||||
|
'authors': ['Abbott']
|
||||||
|
}, [title_test('Flatland', exact=False)]
|
||||||
|
),
|
||||||
|
|
||||||
|
({
|
||||||
|
'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
|
||||||
|
'authors': ['David Handler'],
|
||||||
|
}, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
test_identify_plugin(GoogleBooks.name, tests[:])
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user