From 52c19d7b9bdd15bf74e84744f61a5b22c151fdb0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Feb 2011 20:34:06 -0700 Subject: [PATCH] caching of google identifiers and logic to get cover url from google identifier --- src/calibre/ebooks/metadata/sources/base.py | 16 ++++- src/calibre/ebooks/metadata/sources/google.py | 66 +++++++++++++------ 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 74e184cc66..54d7d49d6d 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re +import re, threading from calibre.customize import Plugin from calibre.utils.logging import ThreadSafeLog, FileStream @@ -30,7 +30,21 @@ class Source(Plugin): touched_fields = frozenset() + def __init__(self, *args, **kwargs): + Plugin.__init__(self, *args, **kwargs) + self._isbn_to_identifier_cache = {} + self.cache_lock = threading.RLock() + # Utility functions {{{ + + def cache_isbn_to_identifier(self, isbn, identifier): + with self.cache_lock: + self._isbn_to_identifier_cache[isbn] = identifier + + def cached_isbn_to_identifier(self, isbn): + with self.cache_lock: + return self._isbn_to_identifier_cache.get(isbn, None) + def get_author_tokens(self, authors, only_first_author=True): ''' Take a list of authors and return a list of tokens useful for an diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 498c7574ea..0720b21ded 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -13,6 +13,7 @@ from functools import partial from lxml import etree +from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Source from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode @@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_, timeout): id_url = entry_id(entry_)[0].text + google_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: @@ -78,6 +80,7 @@ def to_metadata(browser, log, entry_, timeout): return None mi = Metadata(title_, authors) + mi.identifiers = {'google':google_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -103,9 +106,12 @@ def to_metadata(browser, log, entry_, timeout): t = str(x.text).strip() if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): if t[:5].upper() == 'ISBN:': - isbns.append(t[5:]) + t = check_isbn(t[5:]) + if t: + isbns.append(t) if isbns: mi.isbn = sorted(isbns, key=len)[-1] + mi.all_isbns = isbns # Tags try: @@ -133,20 +139,6 @@ def to_metadata(browser, log, entry_, timeout): return mi -def get_all_details(br, log, entries, abort, result_queue, timeout): - for i in entries: - try: - ans = to_metadata(br, log, i, timeout) - if isinstance(ans, Metadata): - result_queue.put(ans) - except: - log.exception( - 'Failed to get metadata for identify entry:', - etree.tostring(i)) - if abort.is_set(): - break - - class GoogleBooks(Source): name = 'Google Books' @@ -185,6 +177,36 @@ class GoogleBooks(Source): 'min-viewability':'none', }) + def cover_url_from_identifiers(self, identifiers): + goog = identifiers.get('google', None) + if goog is None: + isbn = identifiers.get('isbn', None) + goog = self.cached_isbn_to_identifier(isbn) + if goog is not None: + return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' % + goog) + + def is_cover_image_valid(self, raw): + # When no cover is present, returns a PNG saying image not available + # Try for example google identifier llNqPwAACAAJ + # I have yet to see an actual cover in PNG format + return raw and len(raw) > 17000 and raw[1:4] != 'PNG' + + def get_all_details(self, br, log, entries, abort, result_queue, timeout): + for i in entries: + try: + ans = to_metadata(br, log, i, timeout) + if isinstance(ans, Metadata): + result_queue.put(ans) + for isbn in ans.all_isbns: + self.cache_isbn_to_identifier(isbn, + ans.identifiers['google']) + except: + log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if abort.is_set(): + break def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=5): @@ -207,8 +229,8 @@ class GoogleBooks(Source): return as_unicode(e) # There is no point running these queries in threads as google - # throttles requests returning Forbidden errors - get_all_details(br, log, entries, abort, result_queue, timeout) + # throttles requests returning 403 Forbidden errors + self.get_all_details(br, log, entries, abort, result_queue, timeout) return None @@ -218,8 +240,14 @@ if __name__ == '__main__': title_test) test_identify_plugin(GoogleBooks.name, [ + ( - {'title': 'Great Expectations', 'authors':['Charles Dickens']}, - [title_test('Great Expectations', exact=True)] + {'identifiers':{'isbn': '0743273567'}}, + [title_test('The great gatsby', exact=True)] ), + + #( + # {'title': 'Great Expectations', 'authors':['Charles Dickens']}, + # [title_test('Great Expectations', exact=True)] + #), ])