commit 4ae11fa295 (parent 4f5155d190)
Author: Kovid Goyal
Date:   2017-02-26 13:00:36 +05:30

src/calibre/ebooks/metadata/sources/google.py

@@ -1,37 +1,34 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
-
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import time, hashlib
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import hashlib
+import time
 from functools import partial
-from Queue import Queue, Empty
+from Queue import Empty, Queue
 
-from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source
-from calibre.ebooks.metadata.book.base import Metadata
+from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.utils.localization import canonicalize_lang
-from calibre import as_unicode
 
 NAMESPACES = {
-    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
-    'atom' : 'http://www.w3.org/2005/Atom',
-    'dc' : 'http://purl.org/dc/terms',
-    'gd' : 'http://schemas.google.com/g/2005'
+    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom': 'http://www.w3.org/2005/Atom',
+    'dc': 'http://purl.org/dc/terms',
+    'gd': 'http://schemas.google.com/g/2005'
 }
 
 def get_details(browser, url, timeout):  # {{{
     try:
         raw = browser.open_novisit(url, timeout=timeout).read()
     except Exception as e:
-        gc = getattr(e, 'getcode', lambda : -1)
+        gc = getattr(e, 'getcode', lambda: -1)
         if gc() != 403:
             raise
         # Google is throttling us, wait a little
@@ -39,6 +36,8 @@ def get_details(browser, url, timeout): # {{{
         raw = browser.open_novisit(url, timeout=timeout).read()
     return raw
 # }}}
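
get_details() above treats a 403 as Google throttling, pauses, and retries the fetch exactly once; any other error is re-raised immediately. A minimal self-contained sketch of that pattern, assuming a mechanize-style browser whose errors expose getcode(), and an arbitrary two-second back-off (the real sleep length sits in lines elided from this hunk):

    import time

    def fetch_with_retry(browser, url, timeout, wait=2):
        try:
            return browser.open_novisit(url, timeout=timeout).read()
        except Exception as e:
            if getattr(e, 'getcode', lambda: -1)() != 403:
                raise          # a genuine error, not throttling
            time.sleep(wait)   # assumed back-off; the diff elides the real value
            return browser.open_novisit(url, timeout=timeout).read()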
@@ -49,17 +48,17 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # total_results = XPath('//openSearch:totalResults')
     # start_index = XPath('//openSearch:startIndex')
     # items_per_page = XPath('//openSearch:itemsPerPage')
     entry = XPath('//atom:entry')
     entry_id = XPath('descendant::atom:id')
     creator = XPath('descendant::dc:creator')
     identifier = XPath('descendant::dc:identifier')
     title = XPath('descendant::dc:title')
     date = XPath('descendant::dc:date')
     publisher = XPath('descendant::dc:publisher')
     subject = XPath('descendant::dc:subject')
     description = XPath('descendant::dc:description')
     language = XPath('descendant::dc:language')
     rating = XPath('descendant::gd:rating[@average]')
 
     def get_text(extra, x):
         try:
@@ -83,11 +82,12 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         return None
 
     mi = Metadata(title_, authors)
-    mi.identifiers = {'google':google_id}
+    mi.identifiers = {'google': google_id}
     try:
         raw = get_details(browser, id_url, timeout)
-        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-            strip_encoding_pats=True)[0])
+        feed = etree.fromstring(
+            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]
+        )
         extra = entry(feed)[0]
     except:
         log.exception('Failed to get additional details for', mi.title)
@@ -135,7 +135,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
             default = utcnow().replace(day=15)
             mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
         except:
-            log.error('Failed to parse pubdate %r'%pubdate)
+            log.error('Failed to parse pubdate %r' % pubdate)
 
     # Ratings
     for x in rating(extra):
@@ -149,11 +149,14 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # Cover
     mi.has_google_cover = None
     for x in extra.xpath(
-            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
+    ):
         mi.has_google_cover = x.get('href')
         break
 
     return mi
 # }}}
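
All of the dc:/atom:/gd: prefixes in these queries resolve through the NAMESPACES mapping at the top of the file; partial() bakes that map into every compiled expression (the same trick identify() uses further down). A minimal sketch of the mechanism:

    from functools import partial
    from lxml import etree

    # Bind the prefix->URI map into each compiled query so that 'atom:entry'
    # and 'dc:creator' are legal names inside XPath expressions.
    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')   # compiled, reusable query object
    # entries = entry(feed)         # call it on a parsed feed element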
@@ -162,21 +165,23 @@ class GoogleBooks(Source):
     name = 'Google'
     description = _('Downloads metadata and covers from Google Books')
 
-    capabilities = frozenset(['identify', 'cover'])
-    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
-        'comments', 'publisher', 'identifier:isbn', 'rating',
-        'identifier:google', 'languages'])
+    capabilities = frozenset({'identify', 'cover'})
+    touched_fields = frozenset({
+        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
+        'identifier:isbn', 'rating', 'identifier:google', 'languages'
+    })
     supports_gzip_transfer_encoding = True
     cached_cover_url_is_reliable = False
 
     GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
 
-    DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
+    DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657'})
 
     def get_book_url(self, identifiers):  # {{{
         goog = identifiers.get('google', None)
         if goog is not None:
-            return ('google', goog, 'https://books.google.com/books?id=%s'%goog)
+            return ('google', goog, 'https://books.google.com/books?id=%s' % goog)
     # }}}
 
     def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
@@ -185,39 +190,55 @@ class GoogleBooks(Source):
         isbn = check_isbn(identifiers.get('isbn', None))
         q = ''
         if isbn is not None:
-            q += 'isbn:'+isbn
+            q += 'isbn:' + isbn
         elif title or authors:
             def build_term(prefix, parts):
-                return ' '.join('in'+prefix + ':' + x for x in parts)
+                return ' '.join('in' + prefix + ':' + x for x in parts)
             title_tokens = list(self.get_title_tokens(title))
             if title_tokens:
                 q += build_term('title', title_tokens)
-            author_tokens = self.get_author_tokens(authors,
-                    only_first_author=True)
+            author_tokens = self.get_author_tokens(authors, only_first_author=True)
             if author_tokens:
-                q += ('+' if q else '') + build_term('author',
-                        author_tokens)
+                q += ('+' if q else '') + build_term('author', author_tokens)
         if isinstance(q, unicode):
             q = q.encode('utf-8')
         if not q:
             return None
-        return BASE_URL+urlencode({
-            'q':q,
-            'max-results':20,
-            'start-index':1,
-            'min-viewability':'none',
+        return BASE_URL + urlencode({
+            'q': q,
+            'max-results': 20,
+            'start-index': 1,
+            'min-viewability': 'none',
         })
     # }}}
 
-    def download_cover(self, log, result_queue, abort,  # {{{
-            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+    def download_cover(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30,
+        get_best_cover=False
+    ):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
             rq = Queue()
-            self.identify(log, rq, abort, title=title, authors=authors,
-                    identifiers=identifiers)
+            self.identify(
+                log,
+                rq,
+                abort,
+                title=title,
+                authors=authors,
+                identifiers=identifiers
+            )
             if abort.is_set():
                 return
             results = []
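
For reference, create_query() above produces a single feed URL. BASE_URL is defined in a part of the file this diff does not touch, so the endpoint in this sketch is an assumption for illustration; the query grammar and parameter set match the code:

    from urllib import urlencode  # Python 2, as in this module

    q = 'intitle:Great intitle:Gatsby+inauthor:Fitzgerald'  # or 'isbn:0743273567'
    url = 'https://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q,
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    })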
@@ -226,8 +247,11 @@ class GoogleBooks(Source):
                     results.append(rq.get_nowait())
                 except Empty:
                     break
-            results.sort(key=self.identify_results_keygen(
-                title=title, authors=authors, identifiers=identifiers))
+            results.sort(
+                key=self.identify_results_keygen(
+                    title=title, authors=authors, identifiers=identifiers
+                )
+            )
             for mi in results:
                 cached_url = self.get_cached_cover_url(mi.identifiers)
                 if cached_url is not None:
@@ -263,10 +287,18 @@ class GoogleBooks(Source):
             url = self.cached_identifier_to_cover_url(goog)
         return url
     # }}}
 
-    def get_all_details(self, br, log, entries, abort,  # {{{
-            result_queue, timeout):
+    def get_all_details(  # {{{
+        self,
+        br,
+        log,
+        entries,
+        abort,
+        result_queue,
+        timeout
+    ):
         from lxml import etree
         for relevance, i in enumerate(entries):
             try:
@@ -277,26 +309,37 @@ class GoogleBooks(Source):
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn, goog)
                     if getattr(ans, 'has_google_cover', False):
-                        self.cache_identifier_to_cover_url(goog,
-                                self.GOOGLE_COVER%goog)
+                        self.cache_identifier_to_cover_url(
+                            goog, self.GOOGLE_COVER % goog
+                        )
                     self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
             except:
                 log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
+                    'Failed to get metadata for identify entry:', etree.tostring(i)
+                )
             if abort.is_set():
                 break
     # }}}
 
-    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-            identifiers={}, timeout=30):
+    def identify(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30
+    ):
         from lxml import etree
         XPath = partial(etree.XPath, namespaces=NAMESPACES)
         entry = XPath('//atom:entry')
-        query = self.create_query(log, title=title, authors=authors,
-                identifiers=identifiers)
+        query = self.create_query(
+            log, title=title, authors=authors, identifiers=identifiers
+        )
         if not query:
             log.error('Insufficient metadata to construct query')
             return
@@ -304,13 +347,15 @@ class GoogleBooks(Source):
         try:
             raw = br.open_novisit(query, timeout=timeout).read()
         except Exception as e:
-            log.exception('Failed to make identify query: %r'%query)
+            log.exception('Failed to make identify query: %r' % query)
             return as_unicode(e)
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-                strip_encoding_pats=True)[0], parser=parser)
+            feed = etree.fromstring(
+                xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
+                parser=parser
+            )
             entries = entry(feed)
         except Exception as e:
             log.exception('Failed to parse identify results')
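
The defensive parse in this hunk is a reusable pattern: strip control characters and any embedded encoding declaration first, then parse with a recovering, network-disabled lxml parser so malformed server output degrades into a best-effort tree instead of an exception. Condensed from the code above:

    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    def parse_feed(raw):
        # recover=True tolerates broken XML; no_network=True blocks DTD fetches
        parser = etree.XMLParser(recover=True, no_network=True)
        cleaned = xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]
        return etree.fromstring(cleaned, parser=parser)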
@@ -318,34 +363,45 @@ class GoogleBooks(Source):
         if not entries and identifiers and title and authors and \
                 not abort.is_set():
-            return self.identify(log, result_queue, abort, title=title,
-                    authors=authors, timeout=timeout)
+            return self.identify(
+                log,
+                result_queue,
+                abort,
+                title=title,
+                authors=authors,
+                timeout=timeout
+            )
 
         # There is no point running these queries in threads as google
         # throttles requests returning 403 Forbidden errors
         self.get_all_details(br, log, entries, abort, result_queue, timeout)
 
         return None
     # }}}
 
 if __name__ == '__main__':  # tests {{{
     # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
-    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            title_test, authors_test)
-    test_identify_plugin(GoogleBooks.name,
-        [
-            (
-                {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby',
-                    'authors':['Fitzgerald']},
-                [title_test('The great gatsby', exact=True),
-                    authors_test(['F. Scott Fitzgerald'])]
-            ),
-            (
-                {'title': 'Flatland', 'authors':['Abbott']},
-                [title_test('Flatland', exact=False)]
-            ),
-    ])
+    from calibre.ebooks.metadata.sources.test import (
+        test_identify_plugin, title_test, authors_test
+    )
+    test_identify_plugin(
+        GoogleBooks.name, [
+            ({
+                'identifiers': {
+                    'isbn': '0743273567'
+                },
+                'title': 'Great Gatsby',
+                'authors': ['Fitzgerald']
+            }, [
+                title_test('The great gatsby', exact=True),
+                authors_test(['F. Scott Fitzgerald'])
+            ]),
+            ({
+                'title': 'Flatland',
+                'authors': ['Abbott']
+            }, [title_test('Flatland', exact=False)]),
+        ]
+    )
 # }}}
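
One loose end worth noting: the new import block pulls in hashlib, and the class keeps a DUMMY_IMAGE_MD5 set, which together suggest that downloaded covers are checksummed so Google's generic placeholder image can be rejected rather than cached. That check is not shown in this diff; a hedged sketch of the likely shape:

    import hashlib

    DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657'})

    def is_dummy_cover(cdata):
        # Compare the MD5 of the raw image bytes against known placeholder hashes
        return hashlib.md5(cdata).hexdigest() in DUMMY_IMAGE_MD5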