Google Books metadata download plugin ported to new infrastructure

2025-08-05 08:40:13 -04:00 · 2011-02-15 22:17:02 -07:00 · 2011-02-15 22:17:02 -07:00 · 2b4016901e
commit 2b4016901e
parent c0e78379c3
6 changed files with 205 additions and 30 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -95,22 +95,22 @@ class TXT2TXTZ(FileTypePlugin):
    file_types = set(['txt'])
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True
-    
+
    def _get_image_references(self, txt, base_dir):
        images = []
-        
+
        # Textile
        for m in re.finditer(ur'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))', txt):
            path = m.group('path')
            if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
                images.append(path)
-                
-        # Markdown inline        
+
+        # Markdown inline
        for m in re.finditer(ur'(?mu)\!\[([^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*)\]\s*\((?P<path>[^\)]*)\)', txt):
            path = m.group('path')
            if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
                images.append(path)
-        
+
        # Markdown reference
        refs = {}
        for m in re.finditer(ur'(?mu)^(\ ?\ ?\ ?)\[(?P<id>[^\]]*)\]:\s*(?P<path>[^\s]*)$', txt):
@ -123,13 +123,13 @@ class TXT2TXTZ(FileTypePlugin):

        # Remove duplicates
        return list(set(images))
-    
+
    def run(self, path_to_ebook):
        with open(path_to_ebook, 'rb') as ebf:
            txt = ebf.read()
        base_dir = os.path.dirname(path_to_ebook)
        images = self._get_image_references(txt, base_dir)
-        
+
        if images:
            # Create TXTZ and put file plus images inside of it.
            import zipfile
@ -1030,3 +1030,10 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
        Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]

 #}}}
+
+# New metadata download plugins {{{
+from calibre.ebooks.metadata.sources.google import GoogleBooks
+
+plugins += [GoogleBooks]
+
+# }}}
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@ -20,6 +20,7 @@ from calibre.ebooks.metadata.fetch import MetadataSource
 from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
                                 plugin_dir, OptionParser, prefs
 from calibre.ebooks.epub.fix import ePubFixer
+from calibre.ebooks.metadata.sources.base import Source

 platform = 'linux'
 if iswindows:
@ -493,6 +494,17 @@ def epub_fixers():
                    yield plugin
 # }}}

+# Metadata sources2 {{{
+def metadata_plugins(capabilities):
+    capabilities = frozenset(capabilities)
+    for plugin in _initialized_plugins:
+        if isinstance(plugin, Source) and \
+                plugin.capabilities.intersection(capabilities) and \
+                not is_disabled(plugin):
+            yield plugin
+
+# }}}
+
 # Initialize plugins {{{

 _initialized_plugins = []
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)

 __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -8,6 +10,12 @@ __docformat__ = 'restructuredtext en'
 import re

 from calibre.customize import Plugin
+from calibre.utils.logging import ThreadSafeLog, FileStream
+
+def create_log(ostream=None):
+    log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
+    log.outputs = [FileStream(ostream)]
+    return log

 class Source(Plugin):

@ -18,6 +26,11 @@ class Source(Plugin):

    result_of_identify_is_complete = True

+    capabilities = frozenset()
+
+    touched_fields = frozenset()
+
+    # Utility functions {{{
    def get_author_tokens(self, authors, only_first_author=True):
        '''
        Take a list of authors and return a list of tokens useful for an
@ -68,6 +81,10 @@ class Source(Plugin):
                gr.append(job)
        return [g for g in groups if g]

+    # }}}
+
+    # Metadata API {{{
+
    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
        '''
        Identify a book by its title/author/isbn/etc.
@ -87,3 +104,5 @@ class Source(Plugin):
        '''
        return None

+    # }}}
+
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)

 __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -8,12 +10,13 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread, RLock
+from threading import Thread

 from lxml import etree

-from calibre.ebooks.metadata.sources import Source
+from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.date import parse_date, utcnow
 from calibre import browser, as_unicode

@ -38,7 +41,18 @@ subject        = XPath('descendant::dc:subject')
 description    = XPath('descendant::dc:description')
 language       = XPath('descendant::dc:language')

-_log_lock = RLock()
+def get_details(browser, url):
+    try:
+        raw = browser.open_novisit(url).read()
+    except Exception as e:
+        gc = getattr(e, 'getcode', lambda : -1)
+        if gc() != 403:
+            raise
+        # Google is throttling us (HTTP 403); wait a little, then retry once
+        time.sleep(2)
+        raw = browser.open_novisit(url).read()
+
+    return raw

 def to_metadata(browser, log, entry_):

@ -50,8 +64,7 @@ def to_metadata(browser, log, entry_):
                if ans and ans.strip():
                    return ans.strip()
        except:
-            with _log_lock:
-                log.exception('Programming error:')
+            log.exception('Programming error:')
        return None


@ -66,12 +79,11 @@ def to_metadata(browser, log, entry_):

    mi = Metadata(title_, authors)
    try:
-        raw = browser.open_novisit(id_url).read()
-        feed = etree.fromstring(raw)
+        raw = get_details(browser, id_url)
+        feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
-        with _log_lock:
-            log.exception('Failed to get additional details for', mi.title)
+        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
@ -102,8 +114,7 @@ def to_metadata(browser, log, entry_):
            tags.extend([y.strip() for y in t.split('/')])
        tags = list(sorted(list(set(tags))))
    except:
-        with _log_lock:
-            log.exception('Failed to parse tags:')
+        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]
@ -115,8 +126,7 @@ def to_metadata(browser, log, entry_):
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
-            with _log_lock:
-                log.exception('Failed to parse pubdate')
+            log.exception('Failed to parse pubdate')


    return mi
@ -136,10 +146,9 @@ class Worker(Thread):
                if isinstance(ans, Metadata):
                    self.result_queue.put(ans)
            except:
-                with _log_lock:
-                    self.log.exception(
-                        'Failed to get metadata for identify entry:',
-                        etree.tostring(i))
+                self.log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
            if self.abort.is_set():
                break

@ -147,6 +156,11 @@ class Worker(Thread):
 class GoogleBooks(Source):

    name = 'Google Books'
+    description = _('Downloads metadata from Google Books')
+
+    capabilities = frozenset(['identify'])
+    touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
+        'comments', 'publisher', 'author_sort']) # language currently disabled

    def create_query(self, log, title=None, authors=None, identifiers={},
            start_index=1):
@ -158,7 +172,7 @@ class GoogleBooks(Source):
        elif title or authors:
            def build_term(prefix, parts):
                return ' '.join('in'+prefix + ':' + x for x in parts)
-            title_tokens = list(self.get_title_tokens())
+            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = self.get_author_tokens(authors,
@ -190,7 +204,8 @@ class GoogleBooks(Source):

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(raw, parser=parser)
+            feed = etree.fromstring(xml_to_unicode(raw,
+                strip_encoding_pats=True)[0], parser=parser)
            entries = entry(feed)
        except Exception, e:
            log.exception('Failed to parse identify results')
@ -218,4 +233,14 @@ class GoogleBooks(Source):

        return None

-
+if __name__ == '__main__':
+    # To run these tests, use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
+    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
+            isbn_test)
+    test_identify_plugin(GoogleBooks.name,
+        [
+            (
+                {'title': 'Great Expectations', 'authors':['Charles Dickens']},
+                [isbn_test('9781607541592')]
+            ),
+    ])
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, tempfile
+from Queue import Queue, Empty
+from threading import Event
+
+
+from calibre.customize.ui import metadata_plugins
+from calibre import prints
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.sources.base import create_log
+
+def isbn_test(isbn):
+    isbn_ = check_isbn(isbn)
+
+    def test(mi):
+        misbn = check_isbn(mi.isbn)
+        return misbn and misbn == isbn_
+
+    return test
+
+def test_identify_plugin(name, tests):
+    '''
+    :param name: Plugin name
+    :param tests: List of 2-tuples. Each 2-tuple is of the form (args,
+                  test_funcs). args is a dict of keyword arguments to pass to
+                  the identify method. test_funcs are callables that accept a
+                  Metadata object and return True iff the object passes the
+                  test.
+    '''
+    plugin = None
+    for x in metadata_plugins(['identify']):
+        if x.name == name:
+            plugin = x
+            break
+    prints('Testing the identify function of', plugin.name)
+
+    tdir = tempfile.gettempdir()
+    lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt')
+    log = create_log(open(lf, 'wb'))
+    abort = Event()
+    prints('Log saved to', lf)
+
+    for kwargs, test_funcs in tests:
+        prints('Running test with:', kwargs)
+        rq = Queue()
+        args = (log, rq, abort)
+        err = plugin.identify(*args, **kwargs)
+        if err is not None:
+            prints('identify returned an error for args', args)
+            prints(err)
+            break
+
+        results = []
+        while True:
+            try:
+                results.append(rq.get_nowait())
+            except Empty:
+                break
+
+        prints('Found', len(results), 'matches:')
+
+        for mi in results:
+            prints(mi)
+            prints('\n\n')
+
+        match_found = None
+        for mi in results:
+            test_failed = False
+            for tfunc in test_funcs:
+                if not tfunc(mi):
+                    test_failed = True
+                    break
+            if not test_failed:
+                match_found = mi
+                break
+
+        if match_found is None:
+            prints('ERROR: No results that passed all tests were found')
+            prints('Log saved to', lf)
+            raise SystemExit(1)
+
+    prints('Log saved to', lf)
+
--- a/src/calibre/utils/logging.py
+++ b/src/calibre/utils/logging.py
@ -10,17 +10,19 @@ INFO  = 1
 WARN  = 2
 ERROR = 3

-import sys, traceback
+import sys, traceback, cStringIO
 from functools import partial
-
+from threading import RLock



 class Stream(object):

-    def __init__(self, stream):
+    def __init__(self, stream=None):
        from calibre import prints
        self._prints = partial(prints, safe_encode=True)
+        if stream is None:
+            stream = cStringIO.StringIO()
        self.stream = stream

    def flush(self):
@ -50,6 +52,15 @@ class ANSIStream(Stream):
    def flush(self):
        self.stream.flush()

+class FileStream(Stream):
+
+    def __init__(self, stream=None):
+        Stream.__init__(self, stream)
+
+    def prints(self, level, *args, **kwargs):
+        kwargs['file'] = self.stream
+        self._prints(*args, **kwargs)
+
 class HTMLStream(Stream):

    def __init__(self, stream=sys.stdout):
@ -103,4 +114,14 @@ class Log(object):
    def __call__(self, *args, **kwargs):
        self.prints(INFO, *args, **kwargs)

+class ThreadSafeLog(Log):
+
+    def __init__(self, level=Log.INFO):
+        Log.__init__(self, level=level)
+        self._lock = RLock()
+
+    def prints(self, *args, **kwargs):
+        with self._lock:
+            Log.prints(self, *args, **kwargs)
+
 default_log = Log()