Google metadata download: Fix metadata not being found when the title of the book includes a sub-title

This commit is contained in:
Kovid Goyal 2017-02-28 14:57:07 +05:30
parent 329cfcd61c
commit c31ff8f30e

View File

@ -5,7 +5,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import hashlib
import time
from functools import partial
from Queue import Empty, Queue
from calibre import as_unicode
@ -41,9 +40,19 @@ def get_details(browser, url, timeout): # {{{
# }}}
xpath_cache = {}
def XPath(x):
    """Return a compiled XPath evaluator for the expression ``x``.

    Compiled evaluators are memoized in the module-level ``xpath_cache``
    dict, keyed by the expression itself, so each distinct expression is
    compiled at most once. Expressions are compiled with
    ``namespaces=NAMESPACES``.
    """
    compiled = xpath_cache.get(x)
    if compiled is not None:
        # Cache hit: reuse the evaluator built on an earlier call.
        return compiled
    # lxml is imported inside the function, as elsewhere in this file.
    from lxml import etree
    compiled = etree.XPath(x, namespaces=NAMESPACES)
    xpath_cache[x] = compiled
    return compiled
def to_metadata(browser, log, entry_, timeout): # {{{
from lxml import etree
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# total_results = XPath('//openSearch:totalResults')
# start_index = XPath('//openSearch:startIndex')
@ -58,7 +67,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
rating = XPath('descendant::gd:rating[@average]')
# print(etree.tostring(entry_, pretty_print=True))
def get_text(extra, x):
@ -138,15 +146,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{
except:
log.error('Failed to parse pubdate %r' % pubdate)
# Ratings
for x in rating(extra):
try:
mi.rating = float(x.get('average'))
if mi.rating > 5:
mi.rating /= 2
except:
log.exception('Failed to parse rating')
# Cover
mi.has_google_cover = None
for x in extra.xpath(
@ -178,7 +177,8 @@ class GoogleBooks(Source):
GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
DUMMY_IMAGE_MD5 = frozenset(
{'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'})
def get_book_url(self, identifiers): # {{{
goog = identifiers.get('google', None)
@ -202,7 +202,8 @@ class GoogleBooks(Source):
title_tokens = list(self.get_title_tokens(title))
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors, only_first_author=True)
author_tokens = self.get_author_tokens(
authors, only_first_author=True)
if author_tokens:
q += ('+' if q else '') + build_term('author', author_tokens)
@ -322,7 +323,8 @@ class GoogleBooks(Source):
result_queue.put(ans)
except:
log.exception(
'Failed to get metadata for identify entry:', etree.tostring(i)
'Failed to get metadata for identify entry:', etree.tostring(
i)
)
if abort.is_set():
break
@ -340,7 +342,6 @@ class GoogleBooks(Source):
timeout=30
):
from lxml import etree
XPath = partial(etree.XPath, namespaces=NAMESPACES)
entry = XPath('//atom:entry')
query = self.create_query(
@ -350,7 +351,7 @@ class GoogleBooks(Source):
log.error('Insufficient metadata to construct query')
return
br = self.browser
self.log('Making query:', query)
log('Making query:', query)
try:
raw = br.open_novisit(query, timeout=timeout).read()
except Exception as e:
@ -360,7 +361,8 @@ class GoogleBooks(Source):
try:
parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
xml_to_unicode(clean_ascii_chars(
raw), strip_encoding_pats=True)[0],
parser=parser
)
entries = entry(feed)
@ -368,16 +370,29 @@ class GoogleBooks(Source):
log.exception('Failed to parse identify results')
return as_unicode(e)
if not entries and identifiers and title and authors and \
not abort.is_set():
return self.identify(
log,
result_queue,
abort,
title=title,
authors=authors,
timeout=timeout
)
if not entries and title and not abort.is_set():
if identifiers:
log('No results found, retrying without identifiers')
return self.identify(
log,
result_queue,
abort,
title=title,
authors=authors,
timeout=timeout
)
if ':' in title:
title = title.partition(':')[0]
if title:
log('No results found, retrying without sub-title')
return self.identify(
log,
result_queue,
abort,
title=title,
authors=authors,
timeout=timeout
)
# There is no point running these queries in threads as google
# throttles requests returning 403 Forbidden errors
@ -387,27 +402,36 @@ class GoogleBooks(Source):
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/google.py
# To run these test use: calibre-debug
# src/calibre/ebooks/metadata/sources/google.py
from calibre.ebooks.metadata.sources.test import (
test_identify_plugin, title_test, authors_test
)
test_identify_plugin(
GoogleBooks.name, [
({
'identifiers': {
'isbn': '0743273567'
},
'title': 'Great Gatsby',
'authors': ['Fitzgerald']
}, [
title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald'])
]),
({
'title': 'Flatland',
'authors': ['Abbott']
}, [title_test('Flatland', exact=False)]),
tests = [
({
'identifiers': {
'isbn': '0743273567'
},
'title': 'Great Gatsby',
'authors': ['Fitzgerald']
}, [
title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald'])
]
)
),
({
'title': 'Flatland',
'authors': ['Abbott']
}, [title_test('Flatland', exact=False)]
),
({
'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
'authors': ['David Handler'],
}, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')]
)
]
test_identify_plugin(GoogleBooks.name, tests[:])
# }}}