diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a61ae2cb97..6cfe915036 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -95,22 +95,22 @@ class TXT2TXTZ(FileTypePlugin): file_types = set(['txt']) supported_platforms = ['windows', 'osx', 'linux'] on_import = True - + def _get_image_references(self, txt, base_dir): images = [] - + # Textile for m in re.finditer(ur'(?mu)(?:[\[{])?\!(?:\. )?(?P[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))', txt): path = m.group('path') if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)): images.append(path) - - # Markdown inline + + # Markdown inline for m in re.finditer(ur'(?mu)\!\[([^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*)\]\s*\((?P[^\)]*)\)', txt): path = m.group('path') if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)): images.append(path) - + # Markdown reference refs = {} for m in re.finditer(ur'(?mu)^(\ ?\ ?\ ?)\[(?P[^\]]*)\]:\s*(?P[^\s]*)$', txt): @@ -123,13 +123,13 @@ class TXT2TXTZ(FileTypePlugin): # Remove duplicates return list(set(images)) - + def run(self, path_to_ebook): with open(path_to_ebook, 'rb') as ebf: txt = ebf.read() base_dir = os.path.dirname(path_to_ebook) images = self._get_image_references(txt, base_dir) - + if images: # Create TXTZ and put file plus images inside of it. import zipfile @@ -1030,3 +1030,10 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions, Email, Server, Plugins, Tweaks, Misc, TemplateFunctions] #}}} + +# New metadata download plugins {{{ +from calibre.ebooks.metadata.sources.google import GoogleBooks + +plugins += [GoogleBooks] + +# }}} diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index c360122842..5f67e23d92 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -20,6 +20,7 @@ from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser, prefs from calibre.ebooks.epub.fix import ePubFixer +from calibre.ebooks.metadata.sources.base import Source platform = 'linux' if iswindows: @@ -493,6 +494,17 @@ def epub_fixers(): yield plugin # }}} +# Metadata sources2 {{{ +def metadata_plugins(capabilities): + capabilities = frozenset(capabilities) + for plugin in _initialized_plugins: + if isinstance(plugin, Source) and \ + plugin.capabilities.intersection(capabilities) and \ + not is_disabled(plugin): + yield plugin + +# }}} + # Initialize plugins {{{ _initialized_plugins = [] diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 937245cfa9..e5490ef56e 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' @@ -8,6 +10,12 @@ __docformat__ = 'restructuredtext en' import re from calibre.customize import Plugin +from calibre.utils.logging import ThreadSafeLog, FileStream + +def create_log(ostream=None): + log = ThreadSafeLog(level=ThreadSafeLog.DEBUG) + log.outputs = [FileStream(ostream)] + return log class Source(Plugin): @@ -18,6 +26,11 @@ class Source(Plugin): result_of_identify_is_complete = True + capabilities = frozenset() + + touched_fields = frozenset() + + # Utility functions {{{ def get_author_tokens(self, authors, only_first_author=True): ''' Take a list of authors and return a list of tokens useful for an @@ -68,6 +81,10 @@ class Source(Plugin): gr.append(job) return [g for g in groups if g] + # }}} + + # Metadata API {{{ + def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}): ''' Identify a book by its title/author/isbn/etc. @@ -87,3 +104,5 @@ class Source(Plugin): ''' return None + # }}} + diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 7e0e3a0901..fbc3aaa226 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' @@ -8,12 +10,13 @@ __docformat__ = 'restructuredtext en' import time from urllib import urlencode from functools import partial -from threading import Thread, RLock +from threading import Thread from lxml import etree -from calibre.ebooks.metadata.sources import Source +from calibre.ebooks.metadata.sources.base import Source from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.date import parse_date, utcnow from calibre import browser, as_unicode @@ -38,7 +41,18 @@ subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') -_log_lock = RLock() +def get_details(browser, url): + try: + raw = browser.open_novisit(url).read() + except Exception as e: + gc = getattr(e, 'getcode', lambda : -1) + if gc() != 403: + raise + # Google is throttling us, wait a little + time.sleep(2) + raw = browser.open_novisit(url).read() + + return raw def to_metadata(browser, log, entry_): @@ -50,8 +64,7 @@ def to_metadata(browser, log, entry_): if ans and ans.strip(): return ans.strip() except: - with _log_lock: - log.exception('Programming error:') + log.exception('Programming error:') return None @@ -66,12 +79,11 @@ def to_metadata(browser, log, entry_): mi = Metadata(title_, authors) try: - raw = browser.open_novisit(id_url).read() - feed = etree.fromstring(raw) + raw = get_details(browser, id_url) + feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0]) extra = entry(feed)[0] except: - with _log_lock: - log.exception('Failed to get additional details for', mi.title) + log.exception('Failed to get additional details for', mi.title) return mi mi.comments = get_text(extra, description) @@ -102,8 +114,7 @@ def to_metadata(browser, log, entry_): tags.extend([y.strip() for y in t.split('/')]) tags = list(sorted(list(set(tags)))) except: - with _log_lock: - log.exception('Failed to parse tags:') + log.exception('Failed to parse tags:') tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] @@ -115,8 +126,7 @@ def to_metadata(browser, log, entry_): default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: - with _log_lock: - log.exception('Failed to parse pubdate') + log.exception('Failed to parse pubdate') return mi @@ -136,10 +146,9 @@ class Worker(Thread): if isinstance(ans, Metadata): self.result_queue.put(ans) except: - with _log_lock: - self.log.exception( - 'Failed to get metadata for identify entry:', - etree.tostring(i)) + self.log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) if self.abort.is_set(): break @@ -147,6 +156,11 @@ class Worker(Thread): class GoogleBooks(Source): name = 'Google Books' + description = _('Downloads metadata from Google Books') + + capabilities = frozenset(['identify']) + touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate', + 'comments', 'publisher', 'author_sort']) # language currently disabled def create_query(self, log, title=None, authors=None, identifiers={}, start_index=1): @@ -158,7 +172,7 @@ class GoogleBooks(Source): elif title or authors: def build_term(prefix, parts): return ' '.join('in'+prefix + ':' + x for x in parts) - title_tokens = list(self.get_title_tokens()) + title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) author_tokens = self.get_author_tokens(authors, @@ -190,7 +204,8 @@ class GoogleBooks(Source): try: parser = etree.XMLParser(recover=True, no_network=True) - feed = etree.fromstring(raw, parser=parser) + feed = etree.fromstring(xml_to_unicode(raw, + strip_encoding_pats=True)[0], parser=parser) entries = entry(feed) except Exception, e: log.exception('Failed to parse identify results') @@ -218,4 +233,14 @@ class GoogleBooks(Source): return None - +if __name__ == '__main__': + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + isbn_test) + test_identify_plugin(GoogleBooks.name, + [ + ( + {'title': 'Great Expectations', 'authors':['Charles Dickens']}, + [isbn_test('9781607541592')] + ), + ]) diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py new file mode 100644 index 0000000000..3892f31623 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, tempfile +from Queue import Queue, Empty +from threading import Event + + +from calibre.customize.ui import metadata_plugins +from calibre import prints +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import create_log + +def isbn_test(isbn): + isbn_ = check_isbn(isbn) + + def test(mi): + misbn = check_isbn(mi.isbn) + return misbn and misbn == isbn_ + + return test + +def test_identify_plugin(name, tests): + ''' + :param name: Plugin name + :param tests: List of 2-tuples. Each two tuple is of the form (args, + test_funcs). args is a dict of keyword arguments to pass to + the identify method. test_funcs are callables that accept a + Metadata object and return True iff the object passes the + test. + ''' + plugin = None + for x in metadata_plugins(['identify']): + if x.name == name: + plugin = x + break + prints('Testing the identify function of', plugin.name) + + tdir = tempfile.gettempdir() + lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt') + log = create_log(open(lf, 'wb')) + abort = Event() + prints('Log saved to', lf) + + for kwargs, test_funcs in tests: + prints('Running test with:', kwargs) + rq = Queue() + args = (log, rq, abort) + err = plugin.identify(*args, **kwargs) + if err is not None: + prints('identify returned an error for args', args) + prints(err) + break + + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + + prints('Found', len(results), 'matches:') + + for mi in results: + prints(mi) + prints('\n\n') + + match_found = None + for mi in results: + test_failed = False + for tfunc in test_funcs: + if not tfunc(mi): + test_failed = True + break + if not test_failed: + match_found = mi + break + + if match_found is None: + prints('ERROR: No results that passed all tests were found') + prints('Log saved to', lf) + raise SystemExit(1) + + prints('Log saved to', lf) + diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py index 98c7da178e..e7d8774b85 100644 --- a/src/calibre/utils/logging.py +++ b/src/calibre/utils/logging.py @@ -10,17 +10,19 @@ INFO = 1 WARN = 2 ERROR = 3 -import sys, traceback +import sys, traceback, cStringIO from functools import partial - +from threading import RLock class Stream(object): - def __init__(self, stream): + def __init__(self, stream=None): from calibre import prints self._prints = partial(prints, safe_encode=True) + if stream is None: + stream = cStringIO.StringIO() self.stream = stream def flush(self): @@ -50,6 +52,15 @@ class ANSIStream(Stream): def flush(self): self.stream.flush() +class FileStream(Stream): + + def __init__(self, stream=None): + Stream.__init__(self, stream) + + def prints(self, level, *args, **kwargs): + kwargs['file'] = self.stream + self._prints(*args, **kwargs) + class HTMLStream(Stream): def __init__(self, stream=sys.stdout): @@ -103,4 +114,14 @@ class Log(object): def __call__(self, *args, **kwargs): self.prints(INFO, *args, **kwargs) +class ThreadSafeLog(Log): + + def __init__(self, level=Log.INFO): + Log.__init__(self, level=level) + self._lock = RLock() + + def prints(self, *args, **kwargs): + with self._lock: + Log.prints(self, *args, **kwargs) + default_log = Log()