From c31ff8f30e229995ff199dac859b8d6829986fab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 28 Feb 2017 14:57:07 +0530 Subject: [PATCH] Google metadata download: Fix metadata not being found when the title of the book includes a sub-title --- src/calibre/ebooks/metadata/sources/google.py | 116 +++++++++++------- 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 8a03ccd96e..4f055c8ab0 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -5,7 +5,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import hashlib import time -from functools import partial from Queue import Empty, Queue from calibre import as_unicode @@ -41,9 +40,19 @@ def get_details(browser, url, timeout): # {{{ # }}} +xpath_cache = {} + + +def XPath(x): + ans = xpath_cache.get(x) + if ans is None: + from lxml import etree + ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES) + return ans + + def to_metadata(browser, log, entry_, timeout): # {{{ from lxml import etree - XPath = partial(etree.XPath, namespaces=NAMESPACES) # total_results = XPath('//openSearch:totalResults') # start_index = XPath('//openSearch:startIndex') @@ -58,7 +67,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{ subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') - rating = XPath('descendant::gd:rating[@average]') # print(etree.tostring(entry_, pretty_print=True)) def get_text(extra, x): @@ -138,15 +146,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{ except: log.error('Failed to parse pubdate %r' % pubdate) - # Ratings - for x in rating(extra): - try: - mi.rating = float(x.get('average')) - if mi.rating > 5: - mi.rating /= 2 - except: - log.exception('Failed to parse rating') - # Cover mi.has_google_cover = 
None for x in extra.xpath( @@ -178,7 +177,8 @@ class GoogleBooks(Source): GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1' - DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'}) + DUMMY_IMAGE_MD5 = frozenset( + {'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'}) def get_book_url(self, identifiers): # {{{ goog = identifiers.get('google', None) @@ -202,7 +202,8 @@ class GoogleBooks(Source): title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) - author_tokens = self.get_author_tokens(authors, only_first_author=True) + author_tokens = self.get_author_tokens( + authors, only_first_author=True) if author_tokens: q += ('+' if q else '') + build_term('author', author_tokens) @@ -322,7 +323,8 @@ class GoogleBooks(Source): result_queue.put(ans) except: log.exception( - 'Failed to get metadata for identify entry:', etree.tostring(i) + 'Failed to get metadata for identify entry:', etree.tostring( + i) ) if abort.is_set(): break @@ -340,7 +342,6 @@ class GoogleBooks(Source): timeout=30 ): from lxml import etree - XPath = partial(etree.XPath, namespaces=NAMESPACES) entry = XPath('//atom:entry') query = self.create_query( @@ -350,7 +351,7 @@ class GoogleBooks(Source): log.error('Insufficient metadata to construct query') return br = self.browser - self.log('Making query:', query) + log('Making query:', query) try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: @@ -360,7 +361,8 @@ class GoogleBooks(Source): try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring( - xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], + xml_to_unicode(clean_ascii_chars( + raw), strip_encoding_pats=True)[0], parser=parser ) entries = entry(feed) @@ -368,16 +370,29 @@ class GoogleBooks(Source): log.exception('Failed to parse identify results') return as_unicode(e) - if not 
entries and identifiers and title and authors and \ - not abort.is_set(): - return self.identify( - log, - result_queue, - abort, - title=title, - authors=authors, - timeout=timeout - ) + if not entries and title and not abort.is_set(): + if identifiers: + log('No results found, retrying without identifiers') + return self.identify( + log, + result_queue, + abort, + title=title, + authors=authors, + timeout=timeout + ) + if ':' in title: + title = title.partition(':')[0] + if title: + log('No results found, retrying without sub-title') + return self.identify( + log, + result_queue, + abort, + title=title, + authors=authors, + timeout=timeout + ) # There is no point running these queries in threads as google # throttles requests returning 403 Forbidden errors @@ -387,27 +402,36 @@ class GoogleBooks(Source): if __name__ == '__main__': # tests {{{ - # To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/google.py + # To run these tests use: calibre-debug + # src/calibre/ebooks/metadata/sources/google.py from calibre.ebooks.metadata.sources.test import ( test_identify_plugin, title_test, authors_test ) - test_identify_plugin( - GoogleBooks.name, [ - ({ - 'identifiers': { - 'isbn': '0743273567' - }, - 'title': 'Great Gatsby', - 'authors': ['Fitzgerald'] - }, [ - title_test('The great gatsby', exact=True), - authors_test(['F. Scott Fitzgerald']) - ]), - ({ - 'title': 'Flatland', - 'authors': ['Abbott'] - }, [title_test('Flatland', exact=False)]), + tests = [ + ({ + 'identifiers': { + 'isbn': '0743273567' + }, + 'title': 'Great Gatsby', + 'authors': ['Fitzgerald'] + }, [ + title_test('The great gatsby', exact=True), + authors_test(['F. 
Scott Fitzgerald']) ] - ) + ), + + ({ + 'title': 'Flatland', + 'authors': ['Abbott'] + }, [title_test('Flatland', exact=False)] + ), + + ({ + 'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery', + 'authors': ['David Handler'], + }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')] + ) + ] + test_identify_plugin(GoogleBooks.name, tests[:]) # }}}