Caching of Google identifiers and logic to get the cover URL from a Google identifier

Kovid Goyal 2011-02-22 20:34:06 -07:00
parent 6460f08b7f
commit 52c19d7b9b
2 changed files with 62 additions and 20 deletions
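
For orientation, here is a minimal standalone sketch of the pattern this commit introduces: an ISBN-to-Google-identifier cache guarded by an RLock on the Source plugin base class, plus a cover-URL builder on the GoogleBooks plugin that falls back to that cache. The method names mirror the diff below; the wrapper class name, the sample ISBN, and the sample Google identifier are illustrative assumptions, not values paired by the commit.

import threading

class IdentifierCacheSketch(object):
    # Mirrors the helpers added to Source: a dict mapping ISBN ->
    # Google Books volume id, guarded by an RLock so concurrent
    # identify() workers can share it safely.
    def __init__(self):
        self._isbn_to_identifier_cache = {}
        self.cache_lock = threading.RLock()

    def cache_isbn_to_identifier(self, isbn, identifier):
        with self.cache_lock:
            self._isbn_to_identifier_cache[isbn] = identifier

    def cached_isbn_to_identifier(self, isbn):
        with self.cache_lock:
            return self._isbn_to_identifier_cache.get(isbn, None)

    def cover_url_from_identifiers(self, identifiers):
        # Prefer an explicit google id; otherwise fall back to the
        # cache keyed by ISBN, as the new GoogleBooks method does.
        goog = identifiers.get('google', None)
        if goog is None:
            goog = self.cached_isbn_to_identifier(identifiers.get('isbn', None))
        if goog is not None:
            return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1'
                    % goog)

# Hypothetical usage: cache an id seen during identify(), then build the URL.
cache = IdentifierCacheSketch()
cache.cache_isbn_to_identifier('0743273567', 'llNqPwAACAAJ')  # placeholder pairing
print(cache.cover_url_from_identifiers({'isbn': '0743273567'}))
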

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re
+import re, threading

 from calibre.customize import Plugin
 from calibre.utils.logging import ThreadSafeLog, FileStream
@@ -30,7 +30,21 @@ class Source(Plugin):

     touched_fields = frozenset()

+    def __init__(self, *args, **kwargs):
+        Plugin.__init__(self, *args, **kwargs)
+        self._isbn_to_identifier_cache = {}
+        self.cache_lock = threading.RLock()
+
     # Utility functions {{{

+    def cache_isbn_to_identifier(self, isbn, identifier):
+        with self.cache_lock:
+            self._isbn_to_identifier_cache[isbn] = identifier
+
+    def cached_isbn_to_identifier(self, isbn):
+        with self.cache_lock:
+            return self._isbn_to_identifier_cache.get(isbn, None)
+
     def get_author_tokens(self, authors, only_first_author=True):
         '''
         Take a list of authors and return a list of tokens useful for an

View File

@@ -13,6 +13,7 @@ from functools import partial
 from lxml import etree

+from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
@@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_, timeout):

     id_url = entry_id(entry_)[0].text
+    google_id = id_url.split('/')[-1]
     title_ = ': '.join([x.text for x in title(entry_)]).strip()
     authors = [x.text.strip() for x in creator(entry_) if x.text]
     if not authors:
@@ -78,6 +80,7 @@ def to_metadata(browser, log, entry_, timeout):
         return None

     mi = Metadata(title_, authors)
+    mi.identifiers = {'google':google_id}
     try:
         raw = get_details(browser, id_url, timeout)
         feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -103,9 +106,12 @@ def to_metadata(browser, log, entry_, timeout):
         t = str(x.text).strip()
         if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
             if t[:5].upper() == 'ISBN:':
-                isbns.append(t[5:])
+                t = check_isbn(t[5:])
+                if t:
+                    isbns.append(t)
     if isbns:
         mi.isbn = sorted(isbns, key=len)[-1]
+    mi.all_isbns = isbns

     # Tags
     try:
@@ -133,20 +139,6 @@ def to_metadata(browser, log, entry_, timeout):

     return mi

-def get_all_details(br, log, entries, abort, result_queue, timeout):
-    for i in entries:
-        try:
-            ans = to_metadata(br, log, i, timeout)
-            if isinstance(ans, Metadata):
-                result_queue.put(ans)
-        except:
-            log.exception(
-                'Failed to get metadata for identify entry:',
-                etree.tostring(i))
-        if abort.is_set():
-            break
-
 class GoogleBooks(Source):

     name = 'Google Books'
@@ -185,6 +177,36 @@ class GoogleBooks(Source):
             'min-viewability':'none',
             })

+    def cover_url_from_identifiers(self, identifiers):
+        goog = identifiers.get('google', None)
+        if goog is None:
+            isbn = identifiers.get('isbn', None)
+            goog = self.cached_isbn_to_identifier(isbn)
+        if goog is not None:
+            return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
+                    goog)
+
+    def is_cover_image_valid(self, raw):
+        # When no cover is present, returns a PNG saying image not available
+        # Try for example google identifier llNqPwAACAAJ
+        # I have yet to see an actual cover in PNG format
+        return raw and len(raw) > 17000 and raw[1:4] != 'PNG'
+
+    def get_all_details(self, br, log, entries, abort, result_queue, timeout):
+        for i in entries:
+            try:
+                ans = to_metadata(br, log, i, timeout)
+                if isinstance(ans, Metadata):
+                    result_queue.put(ans)
+                    for isbn in ans.all_isbns:
+                        self.cache_isbn_to_identifier(isbn,
+                                ans.identifiers['google'])
+            except:
+                log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if abort.is_set():
+                break
+
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=5):
@@ -207,8 +229,8 @@ class GoogleBooks(Source):
             return as_unicode(e)

         # There is no point running these queries in threads as google
-        # throttles requests returning Forbidden errors
-        get_all_details(br, log, entries, abort, result_queue, timeout)
+        # throttles requests returning 403 Forbidden errors
+        self.get_all_details(br, log, entries, abort, result_queue, timeout)

         return None
@@ -218,8 +240,14 @@ if __name__ == '__main__':
             title_test)
     test_identify_plugin(GoogleBooks.name,
         [
             (
-                {'title': 'Great Expectations', 'authors':['Charles Dickens']},
-                [title_test('Great Expectations', exact=True)]
+                {'identifiers':{'isbn': '0743273567'}},
+                [title_test('The great gatsby', exact=True)]
             ),
+
+            #(
+            #    {'title': 'Great Expectations', 'authors':['Charles Dickens']},
+            #    [title_test('Great Expectations', exact=True)]
+            #),
     ])
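
The new is_cover_image_valid heuristic rejects Google's "image not available" placeholder by size and by PNG signature; the diff's raw[1:4] != 'PNG' comparison works on a Python 2 byte string. A hedged sketch of the same check on Python 3 bytes objects, with fabricated sample buffers used only for illustration:

def looks_like_real_cover(raw):
    # A PNG file starts with b'\x89PNG', so bytes 1..3 spell 'PNG'.
    # Mirrors the diff's `raw and len(raw) > 17000 and raw[1:4] != 'PNG'`:
    # too-small responses and PNG placeholders are treated as "no cover".
    return bool(raw) and len(raw) > 17000 and raw[1:4] != b'PNG'

# Illustrative buffers (not real Google responses):
placeholder_png = b'\x89PNG\r\n\x1a\n' + b'\x00' * 20000
jpeg_cover = b'\xff\xd8\xff\xe0' + b'\x00' * 30000
assert not looks_like_real_cover(placeholder_png)
assert looks_like_real_cover(jpeg_cover)
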