Conversion pipeline framework is finally taking shape

This commit is contained in:
Kovid Goyal 2009-03-10 22:57:12 -07:00
parent 9445f488c2
commit 741d638409
12 changed files with 208 additions and 62 deletions

View File

@ -117,7 +117,11 @@ class InputFormatPlugin(Plugin):
#: instance of :class:`OptionRecommendation`.
options = set([])
def convert(self, stream, options, file_ext, parse_cache, log):
#: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
the path to the created OPF file. All output should be contained in
@ -153,10 +157,16 @@ class InputFormatPlugin(Plugin):
:param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object.
:param accelarators: A dictionary of various information that the input
plugin can get easily that would speed up the
subsequent stages of the conversion.
'''
raise NotImplementedError
def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
def __call__(self, stream, options, file_ext, parse_cache, log,
accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'):
log('on', stream.name)
@ -166,7 +176,8 @@ class InputFormatPlugin(Plugin):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext, parse_cache, log)
ret = self.convert(stream, options, file_ext, parse_cache,
log, accelerators)
for key in list(parse_cache.keys()):
if os.path.abspath(key) != key:
log.warn(('InputFormatPlugin: %s returned a '
@ -221,6 +232,10 @@ class OutputFormatPlugin(Plugin):
#: instance of :class:`OptionRecommendation`.
options = set([])
#: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def convert(self, oeb_book, input_plugin, options, parse_cache, log):
raise NotImplementedError

View File

@ -39,6 +39,7 @@ from optparse import OptionGroup, Option
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
from calibre.customize.conversion import OptionRecommendation
def print_help(parser, log):
help = parser.format_help().encode(preferred_encoding, 'replace')
@ -84,16 +85,16 @@ def add_input_output_options(parser, plumber):
option_recommendation_to_cli_option(group, opt)
if input_options:
title = plumber.input_fmt.upper() + ' ' + _('OPTIONS')
title = _('INPUT OPTIONS')
io = OptionGroup(parser, title, _('Options to control the processing'
' of the input file'))
' of the input %s file')%plumber.input_fmt)
add_options(io.add_option, input_options)
parser.add_option_group(io)
if output_options:
title = plumber.output_fmt.upper() + ' ' + _('OPTIONS')
oo = OptionGroup(parser, title, _('Options to control the processing'
' of the output file'))
' of the output %s file')%plumber.input_fmt)
add_options(oo.add_option, output_options)
parser.add_option_group(oo)
@ -106,6 +107,9 @@ def add_pipeline_options(parser, plumber):
]
),
'METADATA' : (_('Options to set metadata in the output'),
plumber.metadata_option_names,
),
'DEBUG': (_('Options to help with debugging the conversion'),
[
'verbose',
@ -114,7 +118,7 @@ def add_pipeline_options(parser, plumber):
}
group_order = ['', 'DEBUG']
group_order = ['', 'METADATA', 'DEBUG']
for group in group_order:
desc, options = groups[group]
@ -147,11 +151,16 @@ def main(args=sys.argv):
add_pipeline_options(parser, plumber)
opts = parser.parse_args(args)[0]
recommendations = [(n.dest, getattr(opts, n.dest)) \
for n in parser.options_iter()]
recommendations = [(n.dest, getattr(opts, n.dest),
OptionRecommendation.HIGH) \
for n in parser.options_iter()
if n.dest]
plumber.merge_ui_recommendations(recommendations)
plumber.run()
log(_('Output saved to'), ' ', plumber.output)
return 0
if __name__ == '__main__':

View File

@ -9,9 +9,23 @@ from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format
class OptionValues(object):
pass
class Plumber(object):
pipeline_options = [
metadata_option_names = [
'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments',
'publisher', 'series', 'series_index', 'rating', 'isbn',
'tags', 'book_producer', 'language'
]
def __init__(self, input, output, log):
self.input = input
self.output = output
self.log = log
self.pipeline_options = [
OptionRecommendation(name='verbose',
recommended_value=0, level=OptionRecommendation.LOW,
@ -40,13 +54,72 @@ OptionRecommendation(name='output_profile',
'will work on a device. For example EPUB on the SONY reader.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m',
help=_('Read metadata from the specified OPF file. Metadata read '
'from this file will override any metadata in the source '
'file.')
),
OptionRecommendation(name='title',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the title.')),
OptionRecommendation(name='authors',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the authors. Multiple authors should be separated ')),
OptionRecommendation(name='title_sort',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('The version of the title to be used for sorting. ')),
OptionRecommendation(name='author_sort',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('String to be used when sorting by author. ')),
OptionRecommendation(name='cover',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the cover to the specified file.')),
OptionRecommendation(name='comments',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ebook description.')),
OptionRecommendation(name='publisher',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ebook publisher.')),
OptionRecommendation(name='series',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the series this ebook belongs to.')),
OptionRecommendation(name='series_index',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the index of the book in this series.')),
OptionRecommendation(name='rating',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the rating. Should be a number between 1 and 5.')),
OptionRecommendation(name='isbn',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ISBN of the book.')),
OptionRecommendation(name='tags',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the tags for the book. Should be a comma separated list.')),
OptionRecommendation(name='book_producer',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the book producer.')),
OptionRecommendation(name='language',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the language.')),
]
def __init__(self, input, output, log):
self.input = input
self.output = output
self.log = log
input_fmt = os.path.splitext(input)[1]
if not input_fmt:
@ -85,11 +158,79 @@ OptionRecommendation(name='output_profile',
return rec
def merge_plugin_recommendations(self):
pass
for source in (self.input_plugin, self.output_plugin):
for name, val, level in source.recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level:
rec.recommended_value = val
def merge_ui_recommendations(self, recommendations):
pass
for name, val, level in recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level and rec.level < rec.HIGH:
rec.recommended_value = val
def read_user_metadata(self):
from calibre.ebooks.metadata import MetaInformation, string_to_authors
from calibre.ebooks.metadata.opf2 import OPF
mi = MetaInformation(None, [])
if self.opts.read_metadata_from_opf is not None:
self.opts.read_metadata_from_opf = os.path.abspath(
self.opts.read_metadata_from_opf)
opf = OPF(open(self.opts.read_metadata_from_opf, 'rb'),
os.path.dirname(self.opts.read_metadata_from_opf))
mi = MetaInformation(opf)
for x in self.metadata_option_names:
val = getattr(self.opts, x, None)
if val is not None:
if x == 'authors':
val = string_to_authors(val)
elif x == 'tags':
val = [i.strip() for i in val.split(',')]
elif x in ('rating', 'series_index'):
val = float(val)
setattr(mi, x, val)
if mi.cover:
mi.cover_data = ('', open(mi.cover, 'rb').read())
mi.cover = None
self.user_metadata = mi
def setup_options(self):
self.opts = OptionValues()
for group in (self.input_options, self.pipeline_options,
self.output_options):
for rec in group:
setattr(self.opts, rec.option.name, rec.recommended_value)
for x in input_profiles():
if x.short_name == self.opts.input_profile:
self.opts.input_profile = x
break
for x in output_profiles():
if x.short_name == self.opts.output_profile:
self.opts.output_profile = x
break
self.read_user_metadata()
def run(self):
self.setup_options()
from calibre.customize.ui import run_plugins_on_preprocess
self.input = run_plugins_on_preprocess(self.input)
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook
parse_cache, accelerators = {}, {}
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, parse_cache, self.log,
accelerators)
self.reader = OEBReader()
self.oeb = OEBBook(self.log, parse_cache=parse_cache)
self.reader(self.oeb, opfpath)

View File

@ -51,7 +51,8 @@ class EPUBInput(InputFormatPlugin):
traceback.print_exc()
return False
def convert(self, stream, options, file_ext, parse_cache, log):
def convert(self, stream, options, file_ext, parse_cache, log,
accelerators):
from calibre.utils.zipfile import ZipFile
from calibre import walk
from calibre.ebooks import DRMError

View File

@ -12,7 +12,8 @@ class MOBIInput(InputFormatPlugin):
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw'])
def convert(self, stream, options, file_ext, parse_cache, log):
def convert(self, stream, options, file_ext, parse_cache, log,
accelerators):
from calibre.ebooks.mobi.reader import MobiReader
mr = MobiReader(stream, log, options.input_encoding,
options.debug_input)
@ -22,5 +23,8 @@ class MOBIInput(InputFormatPlugin):
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
open('debug-raw.html', 'wb').write(raw)
for f, root in parse_cache.items():
if '.' in f:
accelerators[f] = {'pagebreaks':root.xpath(
'//div[@class="mbp_pagebreak"]')}
return mr.created_opf_path

View File

@ -9,7 +9,6 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
import sys
import os
from struct import pack
import functools
import time
import random
from cStringIO import StringIO
@ -18,11 +17,10 @@ from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
import logging
from lxml import etree
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, sys, re, uuid
import os, re, uuid
from mimetypes import types_map
from collections import defaultdict
from itertools import count
@ -203,14 +203,6 @@ class OEBError(Exception):
"""Generic OEB-processing error."""
pass
class FauxLogger(object):
"""Fake logging interface."""
def __getattr__(self, name):
return self
def __call__(self, message):
print message
class NullContainer(object):
"""An empty container.
@ -1224,16 +1216,20 @@ class PageList(object):
class OEBBook(object):
"""Representation of a book in the IDPF OEB data model."""
def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()):
def __init__(self, logger, parse_cache={}, encoding='utf-8',
pretty_print=False):
"""Create empty book. Optional arguments:
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
paths to te cached files and values are lxml root objects and
cssutils stylesheets.
:param:`encoding`: Default encoding for textual content read
from an external container.
:param:`pretty_print`: Whether or not the canonical string form
of XML markup is pretty-printed.
:prama:`logger`: A Logger object to use for logging all messages
:param:`logger`: A Log object to use for logging all messages
related to the processing of this book. It is accessible
via the instance data member :attr:`logger`.
via the instance data members :attr:`logger,log`.
It provides the following public instance data members for
accessing various parts of the OEB data model:
@ -1251,7 +1247,7 @@ class OEBBook(object):
"""
self.encoding = encoding
self.pretty_print = pretty_print
self.logger = logger
self.logger = self.log = logger
self.version = '2.0'
self.container = NullContainer()
self.metadata = Metadata(self)

View File

@ -19,9 +19,9 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath
from calibre.ebooks.oeb.base import urlnormalize, xml2str
from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.metadata.epub import CoverRenderer
@ -45,9 +45,6 @@ class OEBReader(object):
TRANSFORMS = []
"""List of transforms to apply to content read with this Reader."""
def __init__(self):
return
@classmethod
def config(cls, cfg):
"""Add any book-reading options to the :class:`Config` object
@ -65,7 +62,7 @@ class OEBReader(object):
:param:`oeb`.
"""
self.oeb = oeb
self.logger = oeb.logger
self.logger = self.log = oeb.logger
oeb.container = self.Container(path)
opf = self._read_opf()
self._all_from_opf(opf)

View File

@ -6,18 +6,14 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re
import operator
import math
from itertools import chain
from collections import defaultdict
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.stylizer import Stylizer
COLLAPSE = re.compile(r'[ \t\r\n\v]+')

View File

@ -6,9 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from lxml import etree
from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from calibre.ebooks.oeb.base import element

View File

@ -6,13 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re
import operator
import math
from itertools import chain
from collections import defaultdict
from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME

View File

@ -6,7 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from urlparse import urldefrag
import base64
@ -20,9 +19,9 @@ from PyQt4.QtGui import QImage
from PyQt4.QtGui import QPainter
from PyQt4.QtSvg import QSvgRenderer
from PyQt4.QtGui import QApplication
from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
@ -88,7 +87,7 @@ class SVGRasterizer(object):
hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'):
href = urlnormalize(elem.attrib[XLINK('href')])
path, frag = urldefrag(href)
path = urldefrag(href)[0]
if not path:
continue
abshref = item.abshref(path)