Caching of Google identifiers and logic to get the cover URL from a Google identifier

Kovid Goyal 2011-02-22 20:34:06 -07:00
parent 6460f08b7f
commit 52c19d7b9b
2 changed files with 62 additions and 20 deletions
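
For orientation, here is a minimal standalone sketch of the pattern this commit introduces: an ISBN-to-Google-identifier cache guarded by an RLock on the Source plugin base class, plus a cover-URL builder on the GoogleBooks plugin that falls back to that cache. The method names mirror the diff below; the wrapper class name, the sample ISBN, and the sample Google identifier are illustrative assumptions, not values paired by the commit.

import threading

class IdentifierCacheSketch(object):
    # Mirrors the helpers added to Source: a dict mapping ISBN ->
    # Google Books volume id, guarded by an RLock so concurrent
    # identify() workers can share it safely.
    def __init__(self):
        self._isbn_to_identifier_cache = {}
        self.cache_lock = threading.RLock()

    def cache_isbn_to_identifier(self, isbn, identifier):
        with self.cache_lock:
            self._isbn_to_identifier_cache[isbn] = identifier

    def cached_isbn_to_identifier(self, isbn):
        with self.cache_lock:
            return self._isbn_to_identifier_cache.get(isbn, None)

    def cover_url_from_identifiers(self, identifiers):
        # Prefer an explicit google id; otherwise fall back to the
        # cache keyed by ISBN, as the new GoogleBooks method does.
        goog = identifiers.get('google', None)
        if goog is None:
            goog = self.cached_isbn_to_identifier(identifiers.get('isbn', None))
        if goog is not None:
            return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1'
                    % goog)

# Hypothetical usage: cache an id seen during identify(), then build the URL.
cache = IdentifierCacheSketch()
cache.cache_isbn_to_identifier('0743273567', 'llNqPwAACAAJ')  # placeholder pairing
print(cache.cover_url_from_identifiers({'isbn': '0743273567'}))
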

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re
+import re, threading

 from calibre.customize import Plugin
 from calibre.utils.logging import ThreadSafeLog, FileStream
@@ -30,7 +30,21 @@ class Source(Plugin):

     touched_fields = frozenset()

+    def __init__(self, *args, **kwargs):
+        Plugin.__init__(self, *args, **kwargs)
+        self._isbn_to_identifier_cache = {}
+        self.cache_lock = threading.RLock()
+
     # Utility functions {{{

+    def cache_isbn_to_identifier(self, isbn, identifier):
+        with self.cache_lock:
+            self._isbn_to_identifier_cache[isbn] = identifier
+
+    def cached_isbn_to_identifier(self, isbn):
+        with self.cache_lock:
+            return self._isbn_to_identifier_cache.get(isbn, None)
+
     def get_author_tokens(self, authors, only_first_author=True):
         '''
         Take a list of authors and return a list of tokens useful for an

View File

@@ -13,6 +13,7 @@ from functools import partial
 from lxml import etree

+from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
@@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_, timeout):

     id_url = entry_id(entry_)[0].text
+    google_id = id_url.split('/')[-1]
     title_ = ': '.join([x.text for x in title(entry_)]).strip()
     authors = [x.text.strip() for x in creator(entry_) if x.text]
     if not authors:
@@ -78,6 +80,7 @@ def to_metadata(browser, log, entry_, timeout):
         return None

     mi = Metadata(title_, authors)
+    mi.identifiers = {'google':google_id}
     try:
         raw = get_details(browser, id_url, timeout)
         feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -103,9 +106,12 @@ def to_metadata(browser, log, entry_, timeout):
         t = str(x.text).strip()
         if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
             if t[:5].upper() == 'ISBN:':
-                isbns.append(t[5:])
+                t = check_isbn(t[5:])
+                if t:
+                    isbns.append(t)
     if isbns:
         mi.isbn = sorted(isbns, key=len)[-1]
+    mi.all_isbns = isbns

     # Tags
     try:
@@ -133,20 +139,6 @@ def to_metadata(browser, log, entry_, timeout):

     return mi

-def get_all_details(br, log, entries, abort, result_queue, timeout):
-    for i in entries:
-        try:
-            ans = to_metadata(br, log, i, timeout)
-            if isinstance(ans, Metadata):
-                result_queue.put(ans)
-        except:
-            log.exception(
-                'Failed to get metadata for identify entry:',
-                etree.tostring(i))
-        if abort.is_set():
-            break
-
 class GoogleBooks(Source):

     name = 'Google Books'
@@ -185,6 +177,36 @@ class GoogleBooks(Source):
             'min-viewability':'none',
             })

+    def cover_url_from_identifiers(self, identifiers):
+        goog = identifiers.get('google', None)
+        if goog is None:
+            isbn = identifiers.get('isbn', None)
+            goog = self.cached_isbn_to_identifier(isbn)
+        if goog is not None:
+            return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
+                    goog)
+
+    def is_cover_image_valid(self, raw):
+        # When no cover is present, returns a PNG saying image not available
+        # Try for example google identifier llNqPwAACAAJ
+        # I have yet to see an actual cover in PNG format
+        return raw and len(raw) > 17000 and raw[1:4] != 'PNG'
+
+    def get_all_details(self, br, log, entries, abort, result_queue, timeout):
+        for i in entries:
+            try:
+                ans = to_metadata(br, log, i, timeout)
+                if isinstance(ans, Metadata):
+                    result_queue.put(ans)
+                    for isbn in ans.all_isbns:
+                        self.cache_isbn_to_identifier(isbn,
+                                ans.identifiers['google'])
+            except:
+                log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if abort.is_set():
+                break
+
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=5):
@@ -207,8 +229,8 @@ class GoogleBooks(Source):
             return as_unicode(e)

         # There is no point running these queries in threads as google
-        # throttles requests returning Forbidden errors
-        get_all_details(br, log, entries, abort, result_queue, timeout)
+        # throttles requests returning 403 Forbidden errors
+        self.get_all_details(br, log, entries, abort, result_queue, timeout)

         return None
@@ -218,8 +240,14 @@ if __name__ == '__main__':
             title_test)
     test_identify_plugin(GoogleBooks.name,
         [
             (
-                {'title': 'Great Expectations', 'authors':['Charles Dickens']},
-                [title_test('Great Expectations', exact=True)]
+                {'identifiers':{'isbn': '0743273567'}},
+                [title_test('The great gatsby', exact=True)]
             ),
+
+            #(
+            #    {'title': 'Great Expectations', 'authors':['Charles Dickens']},
+            #    [title_test('Great Expectations', exact=True)]
+            #),
     ])
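
The new is_cover_image_valid heuristic rejects Google's "image not available" placeholder by size and by PNG signature; the diff's raw[1:4] != 'PNG' comparison works on a Python 2 byte string. A hedged sketch of the same check on Python 3 bytes objects, with fabricated sample buffers used only for illustration:

def looks_like_real_cover(raw):
    # A PNG file starts with b'\x89PNG', so bytes 1..3 spell 'PNG'.
    # Mirrors the diff's `raw and len(raw) > 17000 and raw[1:4] != 'PNG'`:
    # too-small responses and PNG placeholders are treated as "no cover".
    return bool(raw) and len(raw) > 17000 and raw[1:4] != b'PNG'

# Illustrative buffers (not real Google responses):
placeholder_png = b'\x89PNG\r\n\x1a\n' + b'\x00' * 20000
jpeg_cover = b'\xff\xd8\xff\xe0' + b'\x00' * 30000
assert not looks_like_real_cover(placeholder_png)
assert looks_like_real_cover(jpeg_cover)
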