Google books metadata download plugin ported to new infrastructure

This commit is contained in:
Kovid Goyal 2011-02-15 22:17:02 -07:00
parent c0e78379c3
commit 2b4016901e
6 changed files with 205 additions and 30 deletions

View File

@ -95,22 +95,22 @@ class TXT2TXTZ(FileTypePlugin):
file_types = set(['txt'])
supported_platforms = ['windows', 'osx', 'linux']
on_import = True
def _get_image_references(self, txt, base_dir):
images = []
# Textile
for m in re.finditer(ur'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))', txt):
path = m.group('path')
if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
images.append(path)
# Markdown inline
# Markdown inline
for m in re.finditer(ur'(?mu)\!\[([^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*)\]\s*\((?P<path>[^\)]*)\)', txt):
path = m.group('path')
if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
images.append(path)
# Markdown reference
refs = {}
for m in re.finditer(ur'(?mu)^(\ ?\ ?\ ?)\[(?P<id>[^\]]*)\]:\s*(?P<path>[^\s]*)$', txt):
@ -123,13 +123,13 @@ class TXT2TXTZ(FileTypePlugin):
# Remove duplicates
return list(set(images))
def run(self, path_to_ebook):
with open(path_to_ebook, 'rb') as ebf:
txt = ebf.read()
base_dir = os.path.dirname(path_to_ebook)
images = self._get_image_references(txt, base_dir)
if images:
# Create TXTZ and put file plus images inside of it.
import zipfile
@ -1030,3 +1030,10 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]
#}}}
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
plugins += [GoogleBooks]
# }}}

View File

@ -20,6 +20,7 @@ from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
plugin_dir, OptionParser, prefs
from calibre.ebooks.epub.fix import ePubFixer
from calibre.ebooks.metadata.sources.base import Source
platform = 'linux'
if iswindows:
@ -493,6 +494,17 @@ def epub_fixers():
yield plugin
# }}}
# Metadata sources2 {{{
def metadata_plugins(capabilities):
    '''
    Yield the initialized metadata source plugins that can be used for at
    least one of the requested capabilities.

    :param capabilities: Iterable of capability names (for example
                         ``['identify']``).
    :return: Generator over the plugins in ``_initialized_plugins`` that are
             instances of ``Source``, share at least one capability with
             ``capabilities`` and are not disabled.
    '''
    wanted = frozenset(capabilities)
    for candidate in _initialized_plugins:
        if not isinstance(candidate, Source):
            continue
        if not candidate.capabilities.intersection(wanted):
            continue
        if is_disabled(candidate):
            continue
        yield candidate
# }}}
# Initialize plugins {{{
_initialized_plugins = []

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -8,6 +10,12 @@ __docformat__ = 'restructuredtext en'
import re
from calibre.customize import Plugin
from calibre.utils.logging import ThreadSafeLog, FileStream
def create_log(ostream=None):
    '''
    Create a thread safe log at DEBUG level whose only output is a
    ``FileStream`` wrapping ``ostream``.

    :param ostream: Object handed unchanged to ``FileStream``.
    :return: The configured ``ThreadSafeLog`` instance.
    '''
    debug_log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
    sink = FileStream(ostream)
    debug_log.outputs = [sink]
    return debug_log
class Source(Plugin):
@ -18,6 +26,11 @@ class Source(Plugin):
result_of_identify_is_complete = True
capabilities = frozenset()
touched_fields = frozenset()
# Utility functions {{{
def get_author_tokens(self, authors, only_first_author=True):
'''
Take a list of authors and return a list of tokens useful for an
@ -68,6 +81,10 @@ class Source(Plugin):
gr.append(job)
return [g for g in groups if g]
# }}}
# Metadata API {{{
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
'''
Identify a book by its title/author/isbn/etc.
@ -87,3 +104,5 @@ class Source(Plugin):
'''
return None
# }}}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
@ -8,12 +10,13 @@ __docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
from threading import Thread, RLock
from threading import Thread
from lxml import etree
from calibre.ebooks.metadata.sources import Source
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
from calibre import browser, as_unicode
@ -38,7 +41,18 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
_log_lock = RLock()
def get_details(browser, url, retry_delay=2):
    '''
    Return the raw data read from ``url``, retrying once if the first
    attempt fails with HTTP status 403.

    :param browser: Object providing ``open_novisit(url)``, whose result has
                    a ``read()`` method.
    :param url: The URL to fetch.
    :param retry_delay: Seconds to wait before the single retry that follows
                        a 403 response. Defaults to 2.
    :return: Whatever ``read()`` returns for the successful request.

    Any failure other than a 403 is re-raised immediately. A failure of the
    retry itself propagates to the caller.
    '''
    try:
        return browser.open_novisit(url).read()
    except Exception as e:
        # Not every exception carries an HTTP status; treat those as -1 so
        # that they are re-raised instead of retried.
        getcode = getattr(e, 'getcode', lambda: -1)
        if getcode() != 403:
            raise
    # Google is throttling us, wait a little
    time.sleep(retry_delay)
    return browser.open_novisit(url).read()
def to_metadata(browser, log, entry_):
@ -50,8 +64,7 @@ def to_metadata(browser, log, entry_):
if ans and ans.strip():
return ans.strip()
except:
with _log_lock:
log.exception('Programming error:')
log.exception('Programming error:')
return None
@ -66,12 +79,11 @@ def to_metadata(browser, log, entry_):
mi = Metadata(title_, authors)
try:
raw = browser.open_novisit(id_url).read()
feed = etree.fromstring(raw)
raw = get_details(browser, id_url)
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
extra = entry(feed)[0]
except:
with _log_lock:
log.exception('Failed to get additional details for', mi.title)
log.exception('Failed to get additional details for', mi.title)
return mi
mi.comments = get_text(extra, description)
@ -102,8 +114,7 @@ def to_metadata(browser, log, entry_):
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
except:
with _log_lock:
log.exception('Failed to parse tags:')
log.exception('Failed to parse tags:')
tags = []
if tags:
mi.tags = [x.replace(',', ';') for x in tags]
@ -115,8 +126,7 @@ def to_metadata(browser, log, entry_):
default = utcnow().replace(day=15)
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
except:
with _log_lock:
log.exception('Failed to parse pubdate')
log.exception('Failed to parse pubdate')
return mi
@ -136,10 +146,9 @@ class Worker(Thread):
if isinstance(ans, Metadata):
self.result_queue.put(ans)
except:
with _log_lock:
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
if self.abort.is_set():
break
@ -147,6 +156,11 @@ class Worker(Thread):
class GoogleBooks(Source):
name = 'Google Books'
description = _('Downloads metadata from Google Books')
capabilities = frozenset(['identify'])
touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
'comments', 'publisher', 'author_sort']) # language currently disabled
def create_query(self, log, title=None, authors=None, identifiers={},
start_index=1):
@ -158,7 +172,7 @@ class GoogleBooks(Source):
elif title or authors:
def build_term(prefix, parts):
return ' '.join('in'+prefix + ':' + x for x in parts)
title_tokens = list(self.get_title_tokens())
title_tokens = list(self.get_title_tokens(title))
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors,
@ -190,7 +204,8 @@ class GoogleBooks(Source):
try:
parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring(raw, parser=parser)
feed = etree.fromstring(xml_to_unicode(raw,
strip_encoding_pats=True)[0], parser=parser)
entries = entry(feed)
except Exception, e:
log.exception('Failed to parse identify results')
@ -218,4 +233,14 @@ class GoogleBooks(Source):
return None
if __name__ == '__main__':
    # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            isbn_test)
    # Each entry pairs the keyword arguments for identify() with the checks
    # that at least one returned result has to pass.
    dickens_query = {'title': 'Great Expectations', 'authors':['Charles Dickens']}
    identify_tests = [
        (dickens_query, [isbn_test('9781607541592')]),
    ]
    test_identify_plugin(GoogleBooks.name, identify_tests)

View File

@ -0,0 +1,91 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, tempfile
from Queue import Queue, Empty
from threading import Event
from calibre.customize.ui import metadata_plugins
from calibre import prints
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import create_log
def isbn_test(isbn):
    '''
    Build a test function that checks a metadata object against ``isbn``.

    Both the expected ISBN and the ``isbn`` attribute of the tested object
    are passed through ``check_isbn`` before being compared.

    :param isbn: The ISBN a result is expected to carry.
    :return: Callable taking a metadata object. Its result is truthy only
             when the checked ISBN of that object is truthy and equal to the
             checked expected ISBN.
    '''
    expected = check_isbn(isbn)

    def test(mi):
        found = check_isbn(mi.isbn)
        if not found:
            # Mirror ``a and b``: hand back the falsy value itself
            return found
        return found == expected

    return test
def test_identify_plugin(name, tests):
    '''
    Run the identify method of the named metadata plugin against a set of
    tests, printing the results as it goes.

    :param name: Plugin name
    :param tests: List of 2-tuples. Each 2-tuple is of the form (args,
                  test_funcs). args is a dict of keyword arguments to pass to
                  the identify method. test_funcs are callables that accept a
                  Metadata object and return True iff the object passes the
                  test.

    :raises SystemExit: With status 1 if no identify capable plugin called
                        ``name`` is found, or if for some test none of the
                        results passes all of its test_funcs.
    '''
    plugin = None
    for x in metadata_plugins(['identify']):
        if x.name == name:
            plugin = x
            break
    if plugin is None:
        # Without this check the first use of plugin.name below would fail
        # with an AttributeError that says nothing about the real problem.
        prints('ERROR: No identify capable plugin named', name, 'was found')
        raise SystemExit(1)
    prints('Testing the identify function of', plugin.name)

    tdir = tempfile.gettempdir()
    lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt')
    log = create_log(open(lf, 'wb'))
    abort = Event()
    prints('Log saved to', lf)

    for kwargs, test_funcs in tests:
        prints('Running test with:', kwargs)
        rq = Queue()
        args = (log, rq, abort)
        err = plugin.identify(*args, **kwargs)
        if err is not None:
            # An error from identify stops the run; remaining tests are
            # not executed.
            prints('identify returned an error for args', args)
            prints(err)
            break

        # Drain everything the plugin put on the result queue
        results = []
        while True:
            try:
                results.append(rq.get_nowait())
            except Empty:
                break

        prints('Found', len(results), 'matches:')

        for mi in results:
            prints(mi)
            prints('\n\n')

        # The test passes as soon as one result satisfies every test function
        match_found = None
        for mi in results:
            if all(tfunc(mi) for tfunc in test_funcs):
                match_found = mi
                break

        if match_found is None:
            prints('ERROR: No results that passed all tests were found')
            prints('Log saved to', lf)
            raise SystemExit(1)

    prints('Log saved to', lf)

View File

@ -10,17 +10,19 @@ INFO = 1
WARN = 2
ERROR = 3
import sys, traceback
import sys, traceback, cStringIO
from functools import partial
from threading import RLock
class Stream(object):
def __init__(self, stream):
def __init__(self, stream=None):
from calibre import prints
self._prints = partial(prints, safe_encode=True)
if stream is None:
stream = cStringIO.StringIO()
self.stream = stream
def flush(self):
@ -50,6 +52,15 @@ class ANSIStream(Stream):
def flush(self):
self.stream.flush()
class FileStream(Stream):
    '''
    Stream whose ``prints`` always writes to the wrapped stream object.
    '''

    def __init__(self, stream=None):
        Stream.__init__(self, stream)

    def prints(self, level, *args, **kwargs):
        # level is accepted but not used; any ``file`` keyword supplied by
        # the caller is replaced by this object's own stream.
        options = dict(kwargs, file=self.stream)
        self._prints(*args, **options)
class HTMLStream(Stream):
def __init__(self, stream=sys.stdout):
@ -103,4 +114,14 @@ class Log(object):
def __call__(self, *args, **kwargs):
self.prints(INFO, *args, **kwargs)
class ThreadSafeLog(Log):
    '''
    Log subclass that holds a re-entrant lock for the duration of every
    ``prints`` call.
    '''

    def __init__(self, level=Log.INFO):
        Log.__init__(self, level=level)
        self._lock = RLock()

    def prints(self, *args, **kwargs):
        self._lock.acquire()
        try:
            Log.prints(self, *args, **kwargs)
        finally:
            # Always release, even if the underlying prints raised
            self._lock.release()
default_log = Log()