Conversion pipeline framework is finally taking shape

This commit is contained in:
Kovid Goyal 2009-03-10 22:57:12 -07:00
parent 9445f488c2
commit 741d638409
12 changed files with 208 additions and 62 deletions

View File

@ -117,7 +117,11 @@ class InputFormatPlugin(Plugin):
#: instance of :class:`OptionRecommendation`. #: instance of :class:`OptionRecommendation`.
options = set([]) options = set([])
def convert(self, stream, options, file_ext, parse_cache, log): #: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return
the path to the created OPF file. All output should be contained in the path to the created OPF file. All output should be contained in
@ -153,10 +157,16 @@ class InputFormatPlugin(Plugin):
:param log: A :class:`calibre.utils.logging.Log` object. All output :param log: A :class:`calibre.utils.logging.Log` object. All output
should use this object. should use this object.
:param accelerators: A dictionary of various information that the input
plugin can get easily that would speed up the
subsequent stages of the conversion.
''' '''
raise NotImplementedError raise NotImplementedError
def __call__(self, stream, options, file_ext, parse_cache, log, output_dir): def __call__(self, stream, options, file_ext, parse_cache, log,
accelerators, output_dir):
log('InputFormatPlugin: %s running'%self.name, end=' ') log('InputFormatPlugin: %s running'%self.name, end=' ')
if hasattr(stream, 'name'): if hasattr(stream, 'name'):
log('on', stream.name) log('on', stream.name)
@ -166,7 +176,8 @@ class InputFormatPlugin(Plugin):
shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
ret = self.convert(stream, options, file_ext, parse_cache, log) ret = self.convert(stream, options, file_ext, parse_cache,
log, accelerators)
for key in list(parse_cache.keys()): for key in list(parse_cache.keys()):
if os.path.abspath(key) != key: if os.path.abspath(key) != key:
log.warn(('InputFormatPlugin: %s returned a ' log.warn(('InputFormatPlugin: %s returned a '
@ -221,6 +232,10 @@ class OutputFormatPlugin(Plugin):
#: instance of :class:`OptionRecommendation`. #: instance of :class:`OptionRecommendation`.
options = set([]) options = set([])
#: A set of 3-tuples of the form
#: (option_name, recommended_value, recommendation_level)
recommendations = set([])
def convert(self, oeb_book, input_plugin, options, parse_cache, log): def convert(self, oeb_book, input_plugin, options, parse_cache, log):
raise NotImplementedError raise NotImplementedError

View File

@ -39,6 +39,7 @@ from optparse import OptionGroup, Option
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.constants import preferred_encoding from calibre.constants import preferred_encoding
from calibre.customize.conversion import OptionRecommendation
def print_help(parser, log): def print_help(parser, log):
help = parser.format_help().encode(preferred_encoding, 'replace') help = parser.format_help().encode(preferred_encoding, 'replace')
@ -84,16 +85,16 @@ def add_input_output_options(parser, plumber):
option_recommendation_to_cli_option(group, opt) option_recommendation_to_cli_option(group, opt)
if input_options: if input_options:
title = plumber.input_fmt.upper() + ' ' + _('OPTIONS') title = _('INPUT OPTIONS')
io = OptionGroup(parser, title, _('Options to control the processing' io = OptionGroup(parser, title, _('Options to control the processing'
' of the input file')) ' of the input %s file')%plumber.input_fmt)
add_options(io.add_option, input_options) add_options(io.add_option, input_options)
parser.add_option_group(io) parser.add_option_group(io)
if output_options: if output_options:
title = plumber.output_fmt.upper() + ' ' + _('OPTIONS') title = plumber.output_fmt.upper() + ' ' + _('OPTIONS')
oo = OptionGroup(parser, title, _('Options to control the processing' oo = OptionGroup(parser, title, _('Options to control the processing'
' of the output file')) ' of the output %s file')%plumber.input_fmt)
add_options(oo.add_option, output_options) add_options(oo.add_option, output_options)
parser.add_option_group(oo) parser.add_option_group(oo)
@ -106,6 +107,9 @@ def add_pipeline_options(parser, plumber):
] ]
), ),
'METADATA' : (_('Options to set metadata in the output'),
plumber.metadata_option_names,
),
'DEBUG': (_('Options to help with debugging the conversion'), 'DEBUG': (_('Options to help with debugging the conversion'),
[ [
'verbose', 'verbose',
@ -114,7 +118,7 @@ def add_pipeline_options(parser, plumber):
} }
group_order = ['', 'DEBUG'] group_order = ['', 'METADATA', 'DEBUG']
for group in group_order: for group in group_order:
desc, options = groups[group] desc, options = groups[group]
@ -147,11 +151,16 @@ def main(args=sys.argv):
add_pipeline_options(parser, plumber) add_pipeline_options(parser, plumber)
opts = parser.parse_args(args)[0] opts = parser.parse_args(args)[0]
recommendations = [(n.dest, getattr(opts, n.dest)) \ recommendations = [(n.dest, getattr(opts, n.dest),
for n in parser.options_iter()] OptionRecommendation.HIGH) \
for n in parser.options_iter()
if n.dest]
plumber.merge_ui_recommendations(recommendations) plumber.merge_ui_recommendations(recommendations)
plumber.run()
log(_('Output saved to'), ' ', plumber.output)
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -9,9 +9,23 @@ from calibre.customize.conversion import OptionRecommendation
from calibre.customize.ui import input_profiles, output_profiles, \ from calibre.customize.ui import input_profiles, output_profiles, \
plugin_for_input_format, plugin_for_output_format plugin_for_input_format, plugin_for_output_format
class OptionValues(object):
    '''
    Plain attribute container for the resolved conversion options.

    An instance starts out empty; option values are attached to it as
    ordinary attributes, one attribute per option name.
    '''

    def __repr__(self):
        # Sort by attribute name so the output is stable between runs and
        # easy to compare in logs.
        items = sorted(vars(self).items())
        return '%s(%s)' % (self.__class__.__name__,
                ', '.join('%s=%r' % item for item in items))
class Plumber(object): class Plumber(object):
pipeline_options = [ metadata_option_names = [
'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments',
'publisher', 'series', 'series_index', 'rating', 'isbn',
'tags', 'book_producer', 'language'
]
def __init__(self, input, output, log):
self.input = input
self.output = output
self.log = log
self.pipeline_options = [
OptionRecommendation(name='verbose', OptionRecommendation(name='verbose',
recommended_value=0, level=OptionRecommendation.LOW, recommended_value=0, level=OptionRecommendation.LOW,
@ -40,13 +54,72 @@ OptionRecommendation(name='output_profile',
'will work on a device. For example EPUB on the SONY reader.' 'will work on a device. For example EPUB on the SONY reader.'
) )
), ),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m',
help=_('Read metadata from the specified OPF file. Metadata read '
'from this file will override any metadata in the source '
'file.')
),
OptionRecommendation(name='title',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the title.')),
OptionRecommendation(name='authors',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the authors. Multiple authors should be separated ')),
OptionRecommendation(name='title_sort',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('The version of the title to be used for sorting. ')),
OptionRecommendation(name='author_sort',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('String to be used when sorting by author. ')),
OptionRecommendation(name='cover',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the cover to the specified file.')),
OptionRecommendation(name='comments',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ebook description.')),
OptionRecommendation(name='publisher',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ebook publisher.')),
OptionRecommendation(name='series',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the series this ebook belongs to.')),
OptionRecommendation(name='series_index',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the index of the book in this series.')),
OptionRecommendation(name='rating',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the rating. Should be a number between 1 and 5.')),
OptionRecommendation(name='isbn',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the ISBN of the book.')),
OptionRecommendation(name='tags',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the tags for the book. Should be a comma separated list.')),
OptionRecommendation(name='book_producer',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the book producer.')),
OptionRecommendation(name='language',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the language.')),
] ]
def __init__(self, input, output, log):
self.input = input
self.output = output
self.log = log
input_fmt = os.path.splitext(input)[1] input_fmt = os.path.splitext(input)[1]
if not input_fmt: if not input_fmt:
@ -85,11 +158,79 @@ OptionRecommendation(name='output_profile',
return rec return rec
def merge_plugin_recommendations(self): def merge_plugin_recommendations(self):
pass for source in (self.input_plugin, self.output_plugin):
for name, val, level in source.recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level:
rec.recommended_value = val
def merge_ui_recommendations(self, recommendations): def merge_ui_recommendations(self, recommendations):
pass for name, val, level in recommendations:
rec = self.get_option_by_name(name)
if rec is not None and rec.level <= level and rec.level < rec.HIGH:
rec.recommended_value = val
def read_user_metadata(self):
    '''
    Collect the metadata requested by the user and store it in
    :attr:`user_metadata`.

    If the ``read_metadata_from_opf`` option is set, the metadata is
    seeded from that OPF file (the option is rewritten to an absolute
    path). Every option named in :attr:`metadata_option_names` that is not
    ``None`` then overrides the corresponding field:

      * ``authors`` is parsed with ``string_to_authors``
      * ``tags`` is split on commas, surrounding whitespace is stripped
        and empty entries are dropped
      * ``rating`` and ``series_index`` are converted to ``float``

    Finally, if a cover path is set, the file contents are loaded into
    ``cover_data`` and the ``cover`` path is cleared.

    :raises IOError: if the OPF file or the cover file cannot be opened.
    :raises ValueError: if ``rating`` or ``series_index`` is not a number.
    '''
    from calibre.ebooks.metadata import MetaInformation, string_to_authors
    from calibre.ebooks.metadata.opf2 import OPF
    mi = MetaInformation(None, [])
    opf_path = self.opts.read_metadata_from_opf
    if opf_path is not None:
        opf_path = os.path.abspath(opf_path)
        self.opts.read_metadata_from_opf = opf_path
        stream = open(opf_path, 'rb')
        try:
            # NOTE(review): assumes OPF()/MetaInformation(opf) consume the
            # stream eagerly, so it can be closed right after -- confirm.
            opf = OPF(stream, os.path.dirname(opf_path))
            mi = MetaInformation(opf)
        finally:
            # Do not leak the file handle, even if parsing fails.
            stream.close()
    for name in self.metadata_option_names:
        val = getattr(self.opts, name, None)
        if val is None:
            continue
        if name == 'authors':
            val = string_to_authors(val)
        elif name == 'tags':
            # Drop empty entries produced by stray or trailing commas.
            val = [t for t in (i.strip() for i in val.split(',')) if t]
        elif name in ('rating', 'series_index'):
            val = float(val)
        setattr(mi, name, val)
    if mi.cover:
        cover = open(mi.cover, 'rb')
        try:
            mi.cover_data = ('', cover.read())
        finally:
            cover.close()
        # The image now lives in cover_data; the path is no longer needed.
        mi.cover = None
    self.user_metadata = mi
def setup_options(self):
    '''
    Populate :attr:`opts` with the final value of every option.

    A fresh :class:`OptionValues` is filled from the recommended values of
    the input, pipeline and output options (in that order). The
    ``input_profile`` and ``output_profile`` values are then replaced by
    the profile object whose ``short_name`` matches, if there is one.
    Finally the user supplied metadata is read.
    '''
    values = OptionValues()
    self.opts = values
    option_groups = (self.input_options, self.pipeline_options,
            self.output_options)
    for recommendations in option_groups:
        for recommendation in recommendations:
            setattr(values, recommendation.option.name,
                    recommendation.recommended_value)

    # Swap each profile short name for the first profile object carrying
    # that name; the value is left untouched when nothing matches.
    profile_sources = (('input_profile', input_profiles),
                       ('output_profile', output_profiles))
    for attr, available in profile_sources:
        for profile in available():
            if profile.short_name == getattr(values, attr):
                setattr(values, attr, profile)
                break

    self.read_user_metadata()
def run(self):
    '''
    Run the conversion pipeline as far as it is implemented.

    Resolves the options, runs the preprocess plugins on the input file,
    hands the input file to the input plugin and reads the OPF it returns
    into a new :class:`OEBBook`, available afterwards as :attr:`oeb`.

    The directory the input plugin writes into is a newly created
    temporary directory, kept in :attr:`input_output_dir`. It is not
    removed here.
    '''
    import tempfile
    self.setup_options()
    from calibre.customize.ui import run_plugins_on_preprocess
    self.input = run_plugins_on_preprocess(self.input)

    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    parse_cache, accelerators = {}, {}

    # InputFormatPlugin.__call__ requires an output directory as its last
    # argument; calling it without one raises a TypeError.
    # NOTE(review): a fresh temporary directory is assumed to be an
    # acceptable output location for the plugin -- confirm.
    self.input_output_dir = tempfile.mkdtemp(prefix='calibre_plumber_')
    stream = open(self.input, 'rb')
    try:
        opfpath = self.input_plugin(stream, self.opts, self.input_fmt,
                parse_cache, self.log, accelerators,
                self.input_output_dir)
    finally:
        # Only the returned OPF path is used from here on, so release the
        # input file handle even when the plugin fails.
        stream.close()

    self.reader = OEBReader()
    self.oeb = OEBBook(self.log, parse_cache=parse_cache)
    self.reader(self.oeb, opfpath)

View File

@ -51,7 +51,8 @@ class EPUBInput(InputFormatPlugin):
traceback.print_exc() traceback.print_exc()
return False return False
def convert(self, stream, options, file_ext, parse_cache, log): def convert(self, stream, options, file_ext, parse_cache, log,
accelerators):
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from calibre import walk from calibre import walk
from calibre.ebooks import DRMError from calibre.ebooks import DRMError

View File

@ -12,7 +12,8 @@ class MOBIInput(InputFormatPlugin):
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = set(['mobi', 'prc', 'azw']) file_types = set(['mobi', 'prc', 'azw'])
def convert(self, stream, options, file_ext, parse_cache, log): def convert(self, stream, options, file_ext, parse_cache, log,
accelerators):
from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.reader import MobiReader
mr = MobiReader(stream, log, options.input_encoding, mr = MobiReader(stream, log, options.input_encoding,
options.debug_input) options.debug_input)
@ -22,5 +23,8 @@ class MOBIInput(InputFormatPlugin):
if isinstance(raw, unicode): if isinstance(raw, unicode):
raw = raw.encode('utf-8') raw = raw.encode('utf-8')
open('debug-raw.html', 'wb').write(raw) open('debug-raw.html', 'wb').write(raw)
for f, root in parse_cache.items():
if '.' in f:
accelerators[f] = {'pagebreaks':root.xpath(
'//div[@class="mbp_pagebreak"]')}
return mr.created_opf_path return mr.created_opf_path

View File

@ -9,7 +9,6 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
import sys import sys
import os import os
from struct import pack from struct import pack
import functools
import time import time
import random import random
from cStringIO import StringIO from cStringIO import StringIO
@ -18,11 +17,10 @@ from itertools import izip, count
from collections import defaultdict from collections import defaultdict
from urlparse import urldefrag from urlparse import urldefrag
import logging import logging
from lxml import etree
from PIL import Image from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.profile import Context

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, re, uuid import os, re, uuid
from mimetypes import types_map from mimetypes import types_map
from collections import defaultdict from collections import defaultdict
from itertools import count from itertools import count
@ -203,14 +203,6 @@ class OEBError(Exception):
"""Generic OEB-processing error.""" """Generic OEB-processing error."""
pass pass
class FauxLogger(object):
"""Fake logging interface."""
def __getattr__(self, name):
return self
def __call__(self, message):
print message
class NullContainer(object): class NullContainer(object):
"""An empty container. """An empty container.
@ -1224,16 +1216,20 @@ class PageList(object):
class OEBBook(object): class OEBBook(object):
"""Representation of a book in the IDPF OEB data model.""" """Representation of a book in the IDPF OEB data model."""
def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): def __init__(self, logger, parse_cache={}, encoding='utf-8',
pretty_print=False):
"""Create empty book. Optional arguments: """Create empty book. Optional arguments:
:param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
paths to the cached files and values are lxml root objects and
cssutils stylesheets.
:param:`encoding`: Default encoding for textual content read :param:`encoding`: Default encoding for textual content read
from an external container. from an external container.
:param:`pretty_print`: Whether or not the canonical string form :param:`pretty_print`: Whether or not the canonical string form
of XML markup is pretty-printed. of XML markup is pretty-printed.
:prama:`logger`: A Logger object to use for logging all messages :param:`logger`: A Log object to use for logging all messages
related to the processing of this book. It is accessible related to the processing of this book. It is accessible
via the instance data member :attr:`logger`. via the instance data members :attr:`logger,log`.
It provides the following public instance data members for It provides the following public instance data members for
accessing various parts of the OEB data model: accessing various parts of the OEB data model:
@ -1251,7 +1247,7 @@ class OEBBook(object):
""" """
self.encoding = encoding self.encoding = encoding
self.pretty_print = pretty_print self.pretty_print = pretty_print
self.logger = logger self.logger = self.log = logger
self.version = '2.0' self.version = '2.0'
self.container = NullContainer() self.container = NullContainer()
self.metadata = Metadata(self) self.metadata = Metadata(self)

View File

@ -19,9 +19,9 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
from calibre.ebooks.oeb.base import urlnormalize, xml2str urlnormalize, BINARY_MIME, \
from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.metadata.epub import CoverRenderer from calibre.ebooks.metadata.epub import CoverRenderer
@ -45,9 +45,6 @@ class OEBReader(object):
TRANSFORMS = [] TRANSFORMS = []
"""List of transforms to apply to content read with this Reader.""" """List of transforms to apply to content read with this Reader."""
def __init__(self):
return
@classmethod @classmethod
def config(cls, cfg): def config(cls, cfg):
"""Add any book-reading options to the :class:`Config` object """Add any book-reading options to the :class:`Config` object
@ -65,7 +62,7 @@ class OEBReader(object):
:param:`oeb`. :param:`oeb`.
""" """
self.oeb = oeb self.oeb = oeb
self.logger = oeb.logger self.logger = self.log = oeb.logger
oeb.container = self.Container(path) oeb.container = self.Container(path)
opf = self._read_opf() opf = self._read_opf()
self._all_from_opf(opf) self._all_from_opf(opf)

View File

@ -6,18 +6,14 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re import re
import operator import operator
import math import math
from itertools import chain
from collections import defaultdict from collections import defaultdict
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import namespace, barename from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
COLLAPSE = re.compile(r'[ \t\r\n\v]+') COLLAPSE = re.compile(r'[ \t\r\n\v]+')

View File

@ -6,9 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from lxml import etree
from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from calibre.ebooks.oeb.base import element from calibre.ebooks.oeb.base import element

View File

@ -6,13 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
import re
import operator
import math
from itertools import chain
from collections import defaultdict
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XHTML_NS from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import CSS_MIME from calibre.ebooks.oeb.base import CSS_MIME

View File

@ -6,7 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os import os
from urlparse import urldefrag from urlparse import urldefrag
import base64 import base64
@ -20,9 +19,9 @@ from PyQt4.QtGui import QImage
from PyQt4.QtGui import QPainter from PyQt4.QtGui import QPainter
from PyQt4.QtSvg import QSvgRenderer from PyQt4.QtSvg import QSvgRenderer
from PyQt4.QtGui import QApplication from PyQt4.QtGui import QApplication
from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
@ -88,7 +87,7 @@ class SVGRasterizer(object):
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'): for elem in xpath(svg, '//svg:*[@xl:href]'):
href = urlnormalize(elem.attrib[XLINK('href')]) href = urlnormalize(elem.attrib[XLINK('href')])
path, frag = urldefrag(href) path = urldefrag(href)[0]
if not path: if not path:
continue continue
abshref = item.abshref(path) abshref = item.abshref(path)