mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-05 08:40:13 -04:00
Google books metadata download plugin ported to new infrastructure
This commit is contained in:
parent
c0e78379c3
commit
2b4016901e
@ -95,22 +95,22 @@ class TXT2TXTZ(FileTypePlugin):
|
||||
file_types = set(['txt'])
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
on_import = True
|
||||
|
||||
|
||||
def _get_image_references(self, txt, base_dir):
|
||||
images = []
|
||||
|
||||
|
||||
# Textile
|
||||
for m in re.finditer(ur'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))', txt):
|
||||
path = m.group('path')
|
||||
if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
|
||||
images.append(path)
|
||||
|
||||
# Markdown inline
|
||||
|
||||
# Markdown inline
|
||||
for m in re.finditer(ur'(?mu)\!\[([^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*(\[[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*\])*[^\]\[]*)\]\s*\((?P<path>[^\)]*)\)', txt):
|
||||
path = m.group('path')
|
||||
if path and not os.path.isabs(path) and guess_type(path)[0] in OEB_IMAGES and os.path.exists(os.path.join(base_dir, path)):
|
||||
images.append(path)
|
||||
|
||||
|
||||
# Markdown reference
|
||||
refs = {}
|
||||
for m in re.finditer(ur'(?mu)^(\ ?\ ?\ ?)\[(?P<id>[^\]]*)\]:\s*(?P<path>[^\s]*)$', txt):
|
||||
@ -123,13 +123,13 @@ class TXT2TXTZ(FileTypePlugin):
|
||||
|
||||
# Remove duplicates
|
||||
return list(set(images))
|
||||
|
||||
|
||||
def run(self, path_to_ebook):
|
||||
with open(path_to_ebook, 'rb') as ebf:
|
||||
txt = ebf.read()
|
||||
base_dir = os.path.dirname(path_to_ebook)
|
||||
images = self._get_image_references(txt, base_dir)
|
||||
|
||||
|
||||
if images:
|
||||
# Create TXTZ and put file plus images inside of it.
|
||||
import zipfile
|
||||
@ -1030,3 +1030,10 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
|
||||
Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]
|
||||
|
||||
#}}}
|
||||
|
||||
# New metadata download plugins {{{
|
||||
from calibre.ebooks.metadata.sources.google import GoogleBooks
|
||||
|
||||
plugins += [GoogleBooks]
|
||||
|
||||
# }}}
|
||||
|
@ -20,6 +20,7 @@ from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.utils.config import make_config_dir, Config, ConfigProxy, \
|
||||
plugin_dir, OptionParser, prefs
|
||||
from calibre.ebooks.epub.fix import ePubFixer
|
||||
from calibre.ebooks.metadata.sources.base import Source
|
||||
|
||||
platform = 'linux'
|
||||
if iswindows:
|
||||
@ -493,6 +494,17 @@ def epub_fixers():
|
||||
yield plugin
|
||||
# }}}
|
||||
|
||||
# Metadata sources2 {{{
|
||||
def metadata_plugins(capabilities):
    '''
    Yield the initialized plugins that are metadata sources usable for at
    least one of the requested capabilities.

    :param capabilities: Iterable of capability names. A plugin is yielded
                         when its own ``capabilities`` set shares at least
                         one entry with this iterable.
    :return: Generator over the plugins in ``_initialized_plugins`` that are
             ``Source`` instances, have a matching capability and are not
             disabled.
    '''
    wanted = frozenset(capabilities)
    for candidate in _initialized_plugins:
        # Checks run in the same order as a chained ``and`` would evaluate
        # them, so is_disabled() is only consulted for matching sources.
        if not isinstance(candidate, Source):
            continue
        if not candidate.capabilities.intersection(wanted):
            continue
        if is_disabled(candidate):
            continue
        yield candidate
|
||||
|
||||
# }}}
|
||||
|
||||
# Initialize plugins {{{
|
||||
|
||||
_initialized_plugins = []
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
@ -8,6 +10,12 @@ __docformat__ = 'restructuredtext en'
|
||||
import re
|
||||
|
||||
from calibre.customize import Plugin
|
||||
from calibre.utils.logging import ThreadSafeLog, FileStream
|
||||
|
||||
def create_log(ostream=None):
    '''
    Create a DEBUG level ThreadSafeLog whose only output is a FileStream.

    :param ostream: Passed unchanged to the FileStream constructor.
    :return: The configured ThreadSafeLog instance.
    '''
    logger = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
    sink = FileStream(ostream)
    # Replace whatever outputs the log starts with by the single file sink
    logger.outputs = [sink]
    return logger
|
||||
|
||||
class Source(Plugin):
|
||||
|
||||
@ -18,6 +26,11 @@ class Source(Plugin):
|
||||
|
||||
result_of_identify_is_complete = True
|
||||
|
||||
capabilities = frozenset()
|
||||
|
||||
touched_fields = frozenset()
|
||||
|
||||
# Utility functions {{{
|
||||
def get_author_tokens(self, authors, only_first_author=True):
|
||||
'''
|
||||
Take a list of authors and return a list of tokens useful for an
|
||||
@ -68,6 +81,10 @@ class Source(Plugin):
|
||||
gr.append(job)
|
||||
return [g for g in groups if g]
|
||||
|
||||
# }}}
|
||||
|
||||
# Metadata API {{{
|
||||
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
|
||||
'''
|
||||
Identify a book by its title/author/isbn/etc.
|
||||
@ -87,3 +104,5 @@ class Source(Plugin):
|
||||
'''
|
||||
return None
|
||||
|
||||
# }}}
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
@ -8,12 +10,13 @@ __docformat__ = 'restructuredtext en'
|
||||
import time
|
||||
from urllib import urlencode
|
||||
from functools import partial
|
||||
from threading import Thread, RLock
|
||||
from threading import Thread
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.metadata.sources import Source
|
||||
from calibre.ebooks.metadata.sources.base import Source
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre import browser, as_unicode
|
||||
|
||||
@ -38,7 +41,18 @@ subject = XPath('descendant::dc:subject')
|
||||
description = XPath('descendant::dc:description')
|
||||
language = XPath('descendant::dc:language')
|
||||
|
||||
_log_lock = RLock()
|
||||
def get_details(browser, url):
    '''
    Read and return the raw contents of url using browser.

    If the first attempt raises an exception whose ``getcode()`` reports 403,
    wait two seconds and try exactly once more. Any other exception, or a
    failure of the second attempt, propagates to the caller.

    :param browser: Object providing ``open_novisit(url)`` which returns
                    something with a ``read()`` method.
    :param url: The URL to fetch.
    :return: Whatever ``read()`` returned.
    '''
    try:
        return browser.open_novisit(url).read()
    except Exception as err:
        # Exceptions without a getcode() method are treated as status -1
        status = getattr(err, 'getcode', lambda: -1)()
        if status != 403:
            raise
        # Google is throttling us, wait a little
        time.sleep(2)
        return browser.open_novisit(url).read()
|
||||
|
||||
def to_metadata(browser, log, entry_):
|
||||
|
||||
@ -50,8 +64,7 @@ def to_metadata(browser, log, entry_):
|
||||
if ans and ans.strip():
|
||||
return ans.strip()
|
||||
except:
|
||||
with _log_lock:
|
||||
log.exception('Programming error:')
|
||||
log.exception('Programming error:')
|
||||
return None
|
||||
|
||||
|
||||
@ -66,12 +79,11 @@ def to_metadata(browser, log, entry_):
|
||||
|
||||
mi = Metadata(title_, authors)
|
||||
try:
|
||||
raw = browser.open_novisit(id_url).read()
|
||||
feed = etree.fromstring(raw)
|
||||
raw = get_details(browser, id_url)
|
||||
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
|
||||
extra = entry(feed)[0]
|
||||
except:
|
||||
with _log_lock:
|
||||
log.exception('Failed to get additional details for', mi.title)
|
||||
log.exception('Failed to get additional details for', mi.title)
|
||||
return mi
|
||||
|
||||
mi.comments = get_text(extra, description)
|
||||
@ -102,8 +114,7 @@ def to_metadata(browser, log, entry_):
|
||||
tags.extend([y.strip() for y in t.split('/')])
|
||||
tags = list(sorted(list(set(tags))))
|
||||
except:
|
||||
with _log_lock:
|
||||
log.exception('Failed to parse tags:')
|
||||
log.exception('Failed to parse tags:')
|
||||
tags = []
|
||||
if tags:
|
||||
mi.tags = [x.replace(',', ';') for x in tags]
|
||||
@ -115,8 +126,7 @@ def to_metadata(browser, log, entry_):
|
||||
default = utcnow().replace(day=15)
|
||||
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
||||
except:
|
||||
with _log_lock:
|
||||
log.exception('Failed to parse pubdate')
|
||||
log.exception('Failed to parse pubdate')
|
||||
|
||||
|
||||
return mi
|
||||
@ -136,10 +146,9 @@ class Worker(Thread):
|
||||
if isinstance(ans, Metadata):
|
||||
self.result_queue.put(ans)
|
||||
except:
|
||||
with _log_lock:
|
||||
self.log.exception(
|
||||
'Failed to get metadata for identify entry:',
|
||||
etree.tostring(i))
|
||||
self.log.exception(
|
||||
'Failed to get metadata for identify entry:',
|
||||
etree.tostring(i))
|
||||
if self.abort.is_set():
|
||||
break
|
||||
|
||||
@ -147,6 +156,11 @@ class Worker(Thread):
|
||||
class GoogleBooks(Source):
|
||||
|
||||
name = 'Google Books'
|
||||
description = _('Downloads metadata from Google Books')
|
||||
|
||||
capabilities = frozenset(['identify'])
|
||||
touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
|
||||
'comments', 'publisher', 'author_sort']) # language currently disabled
|
||||
|
||||
def create_query(self, log, title=None, authors=None, identifiers={},
|
||||
start_index=1):
|
||||
@ -158,7 +172,7 @@ class GoogleBooks(Source):
|
||||
elif title or authors:
|
||||
def build_term(prefix, parts):
|
||||
return ' '.join('in'+prefix + ':' + x for x in parts)
|
||||
title_tokens = list(self.get_title_tokens())
|
||||
title_tokens = list(self.get_title_tokens(title))
|
||||
if title_tokens:
|
||||
q += build_term('title', title_tokens)
|
||||
author_tokens = self.get_author_tokens(authors,
|
||||
@ -190,7 +204,8 @@ class GoogleBooks(Source):
|
||||
|
||||
try:
|
||||
parser = etree.XMLParser(recover=True, no_network=True)
|
||||
feed = etree.fromstring(raw, parser=parser)
|
||||
feed = etree.fromstring(xml_to_unicode(raw,
|
||||
strip_encoding_pats=True)[0], parser=parser)
|
||||
entries = entry(feed)
|
||||
except Exception, e:
|
||||
log.exception('Failed to parse identify results')
|
||||
@ -218,4 +233,14 @@ class GoogleBooks(Source):
|
||||
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            isbn_test)

    # Each entry pairs the keyword arguments for identify() with the checks
    # that at least one returned result has to satisfy.
    identify_tests = [
        (
            {'title': 'Great Expectations', 'authors':['Charles Dickens']},
            [isbn_test('9781607541592')]
        ),
    ]
    test_identify_plugin(GoogleBooks.name, identify_tests)
|
||||
|
91
src/calibre/ebooks/metadata/sources/test.py
Normal file
91
src/calibre/ebooks/metadata/sources/test.py
Normal file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, tempfile
|
||||
from Queue import Queue, Empty
|
||||
from threading import Event
|
||||
|
||||
|
||||
from calibre.customize.ui import metadata_plugins
|
||||
from calibre import prints
|
||||
from calibre.ebooks.metadata import check_isbn
|
||||
from calibre.ebooks.metadata.sources.base import create_log
|
||||
|
||||
def isbn_test(isbn):
    '''
    Build a test callable that checks a Metadata object against isbn.

    :param isbn: The ISBN to compare with. It is passed through check_isbn
                 once, when the test callable is created.
    :return: A callable taking a Metadata-like object ``mi``. Its result is
             truthy only when ``check_isbn(mi.isbn)`` is truthy and equal to
             the checked form of isbn.
    '''
    expected = check_isbn(isbn)

    def test(mi):
        found = check_isbn(mi.isbn)
        return found and found == expected

    return test
|
||||
|
||||
def test_identify_plugin(name, tests):
    '''
    Run the identify method of the named metadata source plugin against a
    list of test cases, printing progress and results.

    :param name: Plugin name
    :param tests: List of 2-tuples. Each two tuple is of the form (args,
                  test_funcs). args is a dict of keyword arguments to pass to
                  the identify method. test_funcs are callables that accept a
                  Metadata object and return True iff the object passes the
                  test.

    Raises SystemExit(1) when no plugin yielded by
    ``metadata_plugins(['identify'])`` has the given name, or when, for some
    test case, none of the results passes all of its test functions. If
    identify itself returns an error, the error is printed and the remaining
    test cases are skipped.
    '''
    plugin = None
    for x in metadata_plugins(['identify']):
        if x.name == name:
            plugin = x
            break
    if plugin is None:
        # Fail with a readable message instead of an AttributeError on
        # None when plugin.name is accessed below.
        prints('ERROR: No identify plugin named', name, 'was found')
        raise SystemExit(1)
    prints('Testing the identify function of', plugin.name)

    # The log produced by the plugin goes to a file in the temp directory
    tdir = tempfile.gettempdir()
    lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt')
    log = create_log(open(lf, 'wb'))
    abort = Event()
    prints('Log saved to', lf)

    for kwargs, test_funcs in tests:
        prints('Running test with:', kwargs)
        rq = Queue()
        args = (log, rq, abort)
        err = plugin.identify(*args, **kwargs)
        if err is not None:
            prints('identify returned an error for args', args)
            prints(err)
            break

        # Drain everything the plugin put on the result queue
        results = []
        while True:
            try:
                results.append(rq.get_nowait())
            except Empty:
                break

        prints('Found', len(results), 'matches:')

        for mi in results:
            prints(mi)
            prints('\n\n')

        # The test case passes as soon as one result satisfies every check
        match_found = None
        for mi in results:
            if all(tfunc(mi) for tfunc in test_funcs):
                match_found = mi
                break

        if match_found is None:
            prints('ERROR: No results that passed all tests were found')
            prints('Log saved to', lf)
            raise SystemExit(1)

    prints('Log saved to', lf)
|
||||
|
@ -10,17 +10,19 @@ INFO = 1
|
||||
WARN = 2
|
||||
ERROR = 3
|
||||
|
||||
import sys, traceback
|
||||
import sys, traceback, cStringIO
|
||||
from functools import partial
|
||||
|
||||
from threading import RLock
|
||||
|
||||
|
||||
|
||||
class Stream(object):
|
||||
|
||||
def __init__(self, stream):
|
||||
def __init__(self, stream=None):
|
||||
from calibre import prints
|
||||
self._prints = partial(prints, safe_encode=True)
|
||||
if stream is None:
|
||||
stream = cStringIO.StringIO()
|
||||
self.stream = stream
|
||||
|
||||
def flush(self):
|
||||
@ -50,6 +52,15 @@ class ANSIStream(Stream):
|
||||
def flush(self):
|
||||
self.stream.flush()
|
||||
|
||||
class FileStream(Stream):
    '''
    Stream whose prints() sends its output to the wrapped stream object.
    '''

    def __init__(self, stream=None):
        Stream.__init__(self, stream)

    def prints(self, level, *args, **kwargs):
        '''
        Print args to the wrapped stream.

        level is accepted but not used by this implementation. Any ``file``
        keyword argument supplied by the caller is overridden.
        '''
        options = dict(kwargs, file=self.stream)
        self._prints(*args, **options)
|
||||
|
||||
class HTMLStream(Stream):
|
||||
|
||||
def __init__(self, stream=sys.stdout):
|
||||
@ -103,4 +114,14 @@ class Log(object):
|
||||
def __call__(self, *args, **kwargs):
|
||||
self.prints(INFO, *args, **kwargs)
|
||||
|
||||
class ThreadSafeLog(Log):
    '''
    Log subclass that holds a re-entrant lock for the duration of every
    prints() call.
    '''

    def __init__(self, level=Log.INFO):
        Log.__init__(self, level=level)
        self._lock = RLock()

    def prints(self, *args, **kwargs):
        '''
        Forward to Log.prints while holding the instance lock.
        '''
        guard = self._lock
        guard.acquire()
        try:
            Log.prints(self, *args, **kwargs)
        finally:
            # Always release, even if the underlying prints raised
            guard.release()
|
||||
|
||||
default_log = Log()
|
||||
|
Loading…
x
Reference in New Issue
Block a user