Also strip text in parentheses automatically from title when retrying Google Books metadata query

This commit is contained in:
Kovid Goyal 2017-03-10 09:12:53 +05:30
parent 932884c6c4
commit d19c60f61d

View File

@ -4,6 +4,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import hashlib import hashlib
import re
import time import time
from Queue import Empty, Queue from Queue import Empty, Queue
@ -39,7 +40,6 @@ def get_details(browser, url, timeout): # {{{
# }}} # }}}
xpath_cache = {} xpath_cache = {}
@ -51,6 +51,12 @@ def XPath(x):
return ans return ans
def cleanup_title(title):
    """Return a simplified form of *title*.

    If *title* contains a colon, everything from the first colon onwards
    is dropped and the leading part is returned unchanged (any
    parenthesised text in it is kept). Otherwise, text in parentheses
    that is preceded by a space is removed together with that space.
    A title with neither a colon nor such a parenthesised part is
    returned as-is.
    """
    # partition() yields an empty separator when no colon is present.
    head, colon, _ = title.partition(':')
    if colon:
        return head
    return re.sub(r'(.+?) \(.+\)', r'\1', title)
def to_metadata(browser, log, entry_, timeout): # {{{ def to_metadata(browser, log, entry_, timeout): # {{{
from lxml import etree from lxml import etree
@ -67,6 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
subject = XPath('descendant::dc:subject') subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description') description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language') language = XPath('descendant::dc:language')
# print(etree.tostring(entry_, pretty_print=True)) # print(etree.tostring(entry_, pretty_print=True))
def get_text(extra, x): def get_text(extra, x):
@ -178,7 +185,8 @@ class GoogleBooks(Source):
GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1' GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
DUMMY_IMAGE_MD5 = frozenset( DUMMY_IMAGE_MD5 = frozenset(
{'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'}) {'0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f'}
)
def get_book_url(self, identifiers): # {{{ def get_book_url(self, identifiers): # {{{
goog = identifiers.get('google', None) goog = identifiers.get('google', None)
@ -202,8 +210,7 @@ class GoogleBooks(Source):
title_tokens = list(self.get_title_tokens(title)) title_tokens = list(self.get_title_tokens(title))
if title_tokens: if title_tokens:
q += build_term('title', title_tokens) q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens( author_tokens = self.get_author_tokens(authors, only_first_author=True)
authors, only_first_author=True)
if author_tokens: if author_tokens:
q += ('+' if q else '') + build_term('author', author_tokens) q += ('+' if q else '') + build_term('author', author_tokens)
@ -323,8 +330,7 @@ class GoogleBooks(Source):
result_queue.put(ans) result_queue.put(ans)
except: except:
log.exception( log.exception(
'Failed to get metadata for identify entry:', etree.tostring( 'Failed to get metadata for identify entry:', etree.tostring(i)
i)
) )
if abort.is_set(): if abort.is_set():
break break
@ -361,8 +367,7 @@ class GoogleBooks(Source):
try: try:
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring( feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars( xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
raw), strip_encoding_pats=True)[0],
parser=parser parser=parser
) )
entries = entry(feed) entries = entry(feed)
@ -381,15 +386,14 @@ class GoogleBooks(Source):
authors=authors, authors=authors,
timeout=timeout timeout=timeout
) )
if ':' in title: ntitle = cleanup_title(title)
title = title.partition(':')[0] if ntitle and ntitle != title:
if title:
log('No results found, retrying without sub-title') log('No results found, retrying without sub-title')
return self.identify( return self.identify(
log, log,
result_queue, result_queue,
abort, abort,
title=title, title=ntitle,
authors=authors, authors=authors,
timeout=timeout timeout=timeout
) )
@ -407,8 +411,7 @@ if __name__ == '__main__': # tests {{{
from calibre.ebooks.metadata.sources.test import ( from calibre.ebooks.metadata.sources.test import (
test_identify_plugin, title_test, authors_test test_identify_plugin, title_test, authors_test
) )
tests = [ tests = [({
({
'identifiers': { 'identifiers': {
'isbn': '0743273567' 'isbn': '0743273567'
}, },
@ -417,21 +420,14 @@ if __name__ == '__main__': # tests {{{
}, [ }, [
title_test('The great gatsby', exact=True), title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald']) authors_test(['F. Scott Fitzgerald'])
] ]), ({
),
({
'title': 'Flatland', 'title': 'Flatland',
'authors': ['Abbott'] 'authors': ['Abbott']
}, [title_test('Flatland', exact=False)] }, [title_test('Flatland', exact=False)]), ({
), 'title':
'The Blood Red Indian Summer: A Berger and Mitry Mystery',
({
'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',
'authors': ['David Handler'], 'authors': ['David Handler'],
}, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')] }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')])]
)
]
test_identify_plugin(GoogleBooks.name, tests[:]) test_identify_plugin(GoogleBooks.name, tests[:])
# }}} # }}}