Conversion pipeline framework is finally taking shape

2025-07-09 03:04:10 -04:00 · 2009-03-10 22:57:12 -07:00 · 2009-03-10 22:57:12 -07:00 · 741d638409
commit 741d638409
parent 9445f488c2
12 changed files with 208 additions and 62 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -117,7 +117,11 @@ class InputFormatPlugin(Plugin):
    #: instance of :class:`OptionRecommendation`.  
    options = set([])
-    def convert(self, stream, options, file_ext, parse_cache, log):
+    #: A set of 3-tuples of the form 
    #: (option_name, recommended_value, recommendation_level)
    recommendations = set([])
    def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
        the path to the created OPF file. All output should be contained in 
@ -153,10 +157,16 @@ class InputFormatPlugin(Plugin):
        :param log: A :class:`calibre.utils.logging.Log` object. All output 
                    should use this object.
        :param accelerators: A dictionary of various information that the input
                             plugin can get easily that would speed up the
                             subsequent stages of the conversion.
        '''
        raise NotImplementedError
-    def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
+    def __call__(self, stream, options, file_ext, parse_cache, log, 
                 accelerators, output_dir):
        log('InputFormatPlugin: %s running'%self.name, end=' ')
        if hasattr(stream, 'name'):
            log('on', stream.name)
@ -166,7 +176,8 @@ class InputFormatPlugin(Plugin):
                shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
-            ret = self.convert(stream, options, file_ext, parse_cache, log)
+            ret = self.convert(stream, options, file_ext, parse_cache, 
                               log, accelerators)
            for key in list(parse_cache.keys()):
                if os.path.abspath(key) != key:
                    log.warn(('InputFormatPlugin: %s returned a '
@ -221,6 +232,10 @@ class OutputFormatPlugin(Plugin):
    #: instance of :class:`OptionRecommendation`.  
    options = set([])
    #: A set of 3-tuples of the form 
    #: (option_name, recommended_value, recommendation_level)
    recommendations = set([])
    def convert(self, oeb_book, input_plugin, options, parse_cache, log):
        raise NotImplementedError
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -39,6 +39,7 @@ from optparse import OptionGroup, Option
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import Log
 from calibre.constants import preferred_encoding
 from calibre.customize.conversion import OptionRecommendation
 def print_help(parser, log):
    help = parser.format_help().encode(preferred_encoding, 'replace')
@ -84,16 +85,16 @@ def add_input_output_options(parser, plumber):
            option_recommendation_to_cli_option(group, opt)
    if input_options:
-        title = plumber.input_fmt.upper() + ' ' + _('OPTIONS')
+        title = _('INPUT OPTIONS')
        io = OptionGroup(parser, title, _('Options to control the processing'
-                                          ' of the input file'))
+                          ' of the input %s file')%plumber.input_fmt)
        add_options(io.add_option, input_options)
        parser.add_option_group(io)
    if output_options:
        title = plumber.output_fmt.upper() + ' ' + _('OPTIONS')
        oo = OptionGroup(parser, title, _('Options to control the processing'
-                                          ' of the output file'))
+                          ' of the output %s file')%plumber.input_fmt)
        add_options(oo.add_option, output_options)
        parser.add_option_group(oo)
@ -106,6 +107,9 @@ def add_pipeline_options(parser, plumber):
                     ]
                    ),
              'METADATA' : (_('Options to set metadata in the output'),
                            plumber.metadata_option_names,
                            ),
              'DEBUG': (_('Options to help with debugging the conversion'),
                        [
                         'verbose',
@ -114,7 +118,7 @@ def add_pipeline_options(parser, plumber):
              }
-    group_order = ['', 'DEBUG']
+    group_order = ['', 'METADATA', 'DEBUG']
    for group in group_order:
        desc, options = groups[group]
@ -147,11 +151,16 @@ def main(args=sys.argv):
    add_pipeline_options(parser, plumber)
    opts = parser.parse_args(args)[0]
-    recommendations = [(n.dest, getattr(opts, n.dest)) \
+    recommendations = [(n.dest, getattr(opts, n.dest), 
-                                        for n in parser.options_iter()]
+                        OptionRecommendation.HIGH) \
-    
+                                        for n in parser.options_iter()
                                        if n.dest]
    plumber.merge_ui_recommendations(recommendations)
    plumber.run()
    log(_('Output saved to'), ' ', plumber.output)
    return 0
 if __name__ == '__main__':
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -9,9 +9,23 @@ from calibre.customize.conversion import OptionRecommendation
 from calibre.customize.ui import input_profiles, output_profiles, \
        plugin_for_input_format, plugin_for_output_format
 class OptionValues(object):
    '''
    Empty attribute container. :meth:`Plumber.setup_options` creates an
    instance and sets one attribute per option on it.
    '''
    pass
 class Plumber(object):
-    pipeline_options = [
+    metadata_option_names = [
        'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', 
        'publisher', 'series', 'series_index', 'rating', 'isbn', 
        'tags', 'book_producer', 'language'
        ]
    def __init__(self, input, output, log):
        self.input = input
        self.output = output
        self.log = log
        self.pipeline_options = [
 OptionRecommendation(name='verbose', 
            recommended_value=0, level=OptionRecommendation.LOW,
@ -40,13 +54,72 @@ OptionRecommendation(name='output_profile',
                   'will work on a device. For example EPUB on the SONY reader.'
                   )
        ),
 OptionRecommendation(name='read_metadata_from_opf', 
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m', 
            help=_('Read metadata from the specified OPF file. Metadata read '
                   'from this file will override any metadata in the source ' 
                   'file.')
        ),
 OptionRecommendation(name='title',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the title.')),
 OptionRecommendation(name='authors',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the authors. Multiple authors should be separated ')),
 OptionRecommendation(name='title_sort',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('The version of the title to be used for sorting. ')),
 OptionRecommendation(name='author_sort',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('String to be used when sorting by author. ')),
 OptionRecommendation(name='cover',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the cover to the specified file.')),
 OptionRecommendation(name='comments',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ebook description.')),
 OptionRecommendation(name='publisher',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ebook publisher.')),
 OptionRecommendation(name='series',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the series this ebook belongs to.')),
 OptionRecommendation(name='series_index',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the index of the book in this series.')),
 OptionRecommendation(name='rating',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the rating. Should be a number between 1 and 5.')),
 OptionRecommendation(name='isbn',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the ISBN of the book.')),
 OptionRecommendation(name='tags',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the tags for the book. Should be a comma separated list.')),
 OptionRecommendation(name='book_producer',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the book producer.')),
 OptionRecommendation(name='language',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Set the language.')),
 ]
    def __init__(self, input, output, log):
        self.input = input
        self.output = output
        self.log = log
        input_fmt = os.path.splitext(input)[1]
        if not input_fmt:
@ -85,11 +158,79 @@ OptionRecommendation(name='output_profile',
                    return rec
    def merge_plugin_recommendations(self):
        '''
        Apply the recommendations of the input and output plugins.

        Each plugin's ``recommendations`` is iterated as
        ``(option_name, recommended_value, recommendation_level)`` tuples.
        Names for which :meth:`get_option_by_name` returns ``None`` are
        ignored. Only ``recommended_value`` is updated; the level stored on
        the option is left unchanged.
        '''
-        pass
+        for source in (self.input_plugin, self.output_plugin):
            for name, val, level in source.recommendations:
                rec = self.get_option_by_name(name)
                # Override only when the recommendation is at least as strong
                # as the option's current level.
                if rec is not None and rec.level <= level:
                    rec.recommended_value = val
    def merge_ui_recommendations(self, recommendations):
        '''
        Apply recommendations coming from the user interface.

        :param recommendations: An iterable of
                    ``(option_name, recommended_value, recommendation_level)``
                    tuples. Names for which :meth:`get_option_by_name`
                    returns ``None`` are ignored.
        '''
-        pass
+        for name, val, level in recommendations:
            rec = self.get_option_by_name(name)
            # Same rule as for plugin recommendations, except that an option
            # whose own level is already HIGH is never overridden.
            if rec is not None and rec.level <= level and rec.level < rec.HIGH:
                rec.recommended_value = val
    def read_user_metadata(self):
        '''
        Collect the metadata supplied by the user and store it in
        ``self.user_metadata``.

        The metadata starts out empty, or is loaded from the OPF file named
        by the ``read_metadata_from_opf`` option when that option is set
        (the option is rewritten to an absolute path). Every option listed in
        :attr:`metadata_option_names` whose value is not ``None`` then
        overrides the corresponding field:

            * ``authors`` is passed through ``string_to_authors``
            * ``tags`` is split on commas and each tag is stripped
            * ``rating`` and ``series_index`` are converted to ``float``

        If a cover path ends up being set, the file is read into
        ``cover_data`` and ``cover`` is reset to ``None``.
        '''
        from calibre.ebooks.metadata import MetaInformation, string_to_authors
        from calibre.ebooks.metadata.opf2 import OPF
        mi = MetaInformation(None, [])
        if self.opts.read_metadata_from_opf is not None:
            self.opts.read_metadata_from_opf = os.path.abspath(
                                            self.opts.read_metadata_from_opf)
            # NOTE(review): this stream is never closed explicitly. It is left
            # as is because it is not visible here whether OPF reads the
            # stream eagerly -- confirm, then close it as well.
            opf = OPF(open(self.opts.read_metadata_from_opf, 'rb'),
                      os.path.dirname(self.opts.read_metadata_from_opf))
            mi = MetaInformation(opf)
        for x in self.metadata_option_names:
            val = getattr(self.opts, x, None)
            if val is not None:
                if x == 'authors':
                    val = string_to_authors(val)
                elif x == 'tags':
                    val = [i.strip() for i in val.split(',')]
                elif x in ('rating', 'series_index'):
                    val = float(val)
                setattr(mi, x, val)
        if mi.cover:
            # Close the cover file deterministically instead of leaking the
            # handle. try/finally is used because this module is not shown to
            # enable the with statement.
            cover_file = open(mi.cover, 'rb')
            try:
                cover_data = cover_file.read()
            finally:
                cover_file.close()
            mi.cover_data = ('', cover_data)
            mi.cover = None
        self.user_metadata = mi
    def setup_options(self):
        '''
        Create ``self.opts`` and fill it in from the option recommendations.

        Every recommendation in the input, pipeline and output option groups
        becomes an attribute of ``self.opts`` holding its recommended value.
        The ``input_profile`` and ``output_profile`` values are then replaced
        by the first profile whose ``short_name`` matches; they are left
        untouched when no profile matches. Finally the user supplied metadata
        is read.
        '''
        values = self.opts = OptionValues()
        option_groups = (self.input_options, self.pipeline_options,
                         self.output_options)
        for recommendations in option_groups:
            for recommendation in recommendations:
                setattr(values, recommendation.option.name,
                        recommendation.recommended_value)

        # Profile lists are fetched lazily and in the same order as before:
        # input profiles first, then output profiles.
        profile_sources = (('input_profile', input_profiles),
                           ('output_profile', output_profiles))
        for attr, list_profiles in profile_sources:
            for profile in list_profiles():
                if profile.short_name == getattr(values, attr):
                    setattr(values, attr, profile)
                    break

        self.read_user_metadata()
    def run(self):
        self.setup_options()
        from calibre.customize.ui import run_plugins_on_preprocess
        self.input = run_plugins_on_preprocess(self.input)
        from calibre.ebooks.oeb.reader import OEBReader
        from calibre.ebooks.oeb.base import OEBBook
        parse_cache, accelerators = {}, {}
        opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, 
                                    self.input_fmt, parse_cache, self.log,
                                    accelerators)
        self.reader = OEBReader()
        self.oeb = OEBBook(self.log, parse_cache=parse_cache) 
        self.reader(self.oeb, opfpath)
--- a/src/calibre/ebooks/epub/input.py
+++ b/src/calibre/ebooks/epub/input.py
@ -51,7 +51,8 @@ class EPUBInput(InputFormatPlugin):
            traceback.print_exc()
        return False
-    def convert(self, stream, options, file_ext, parse_cache, log):
+    def convert(self, stream, options, file_ext, parse_cache, log, 
                accelerators):
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -12,7 +12,8 @@ class MOBIInput(InputFormatPlugin):
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])
-    def convert(self, stream, options, file_ext, parse_cache, log):
+    def convert(self, stream, options, file_ext, parse_cache, log, 
                accelerators):
        from calibre.ebooks.mobi.reader import MobiReader
        mr = MobiReader(stream, log, options.input_encoding, 
                        options.debug_input)
@ -22,5 +23,8 @@ class MOBIInput(InputFormatPlugin):
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            open('debug-raw.html', 'wb').write(raw)
-            
+        for f, root in parse_cache.items():
            if '.' in f:
                accelerators[f] = {'pagebreaks':root.xpath(
                                            '//div[@class="mbp_pagebreak"]')}
        return mr.created_opf_path
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@ -9,7 +9,6 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
 import sys
 import os
 from struct import pack
 import functools
 import time
 import random
 from cStringIO import StringIO
@ -18,11 +17,10 @@ from itertools import izip, count
 from collections import defaultdict
 from urlparse import urldefrag
 import logging
 from lxml import etree
 from PIL import Image
 from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
    OEB_RASTER_IMAGES
-from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
+from calibre.ebooks.oeb.base import namespace, prefixname
 from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.base import OEBBook
 from calibre.ebooks.oeb.profile import Context
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'
-import os, sys, re, uuid
+import os, re, uuid
 from mimetypes import types_map
 from collections import defaultdict
 from itertools import count
@ -203,14 +203,6 @@ class OEBError(Exception):
    """Generic OEB-processing error."""
    pass
 class FauxLogger(object):
    """Fake logging interface."""
    def __getattr__(self, name):
        # Any attribute that is not found normally resolves to the logger
        # itself, so ``logger.<anything>(message)`` ends up in __call__.
        return self
    def __call__(self, message):
        # Python 2 print statement: writes the message to standard output.
        print message
 class NullContainer(object):
    """An empty container.
@ -1224,16 +1216,20 @@ class PageList(object):
 class OEBBook(object):
    """Representation of a book in the IDPF OEB data model."""
-    def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()):
+    def __init__(self, logger, parse_cache={}, encoding='utf-8', 
                 pretty_print=False):
        """Create empty book.  Optional arguments:
        :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
            paths to the cached files and values are lxml root objects and
            cssutils stylesheets.
        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
-        :prama:`logger`: A Logger object to use for logging all messages
+        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book.  It is accessible
-            via the instance data member :attr:`logger`.
+            via the instance data members :attr:`logger,log`.
        It provides the following public instance data members for
        accessing various parts of the OEB data model:
@ -1251,7 +1247,7 @@ class OEBBook(object):
        """
        self.encoding = encoding
        self.pretty_print = pretty_print
-        self.logger = logger
+        self.logger = self.log = logger
        self.version = '2.0'
        self.container = NullContainer()
        self.metadata = Metadata(self)
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -19,9 +19,9 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
 from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
    ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
-from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath
+from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
-from calibre.ebooks.oeb.base import urlnormalize, xml2str
+                                    urlnormalize, BINARY_MIME, \
-from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer
+                                    OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.metadata.epub import CoverRenderer
@ -45,9 +45,6 @@ class OEBReader(object):
    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""
    def __init__(self):
        # No-op constructor: takes no arguments and sets no instance state.
        return
    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
@ -65,7 +62,7 @@ class OEBReader(object):
        :param:`oeb`.
        """
        self.oeb = oeb
-        self.logger = oeb.logger
+        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path)
        opf = self._read_opf()
        self._all_from_opf(opf)
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@ -6,18 +6,14 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import sys
 import os
 import re
 import operator
 import math
 from itertools import chain
 from collections import defaultdict
 from lxml import etree
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES
 from calibre.ebooks.oeb.base import namespace, barename
 from calibre.ebooks.oeb.base import OEBBook
 from calibre.ebooks.oeb.stylizer import Stylizer
 COLLAPSE = re.compile(r'[ \t\r\n\v]+')
--- a/src/calibre/ebooks/oeb/transforms/htmltoc.py
+++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py
@ -6,9 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import sys
 import os
 from lxml import etree
 from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
 from calibre.ebooks.oeb.base import element
--- a/src/calibre/ebooks/oeb/transforms/manglecase.py
+++ b/src/calibre/ebooks/oeb/transforms/manglecase.py
@ -6,13 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import sys
 import os
 import re
 import operator
 import math
 from itertools import chain
 from collections import defaultdict
 from lxml import etree
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import CSS_MIME
--- a/src/calibre/ebooks/oeb/transforms/rasterize.py
+++ b/src/calibre/ebooks/oeb/transforms/rasterize.py
@ -6,7 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import sys
 import os
 from urlparse import urldefrag
 import base64
@ -20,9 +19,9 @@ from PyQt4.QtGui import QImage
 from PyQt4.QtGui import QPainter
 from PyQt4.QtSvg import QSvgRenderer
 from PyQt4.QtGui import QApplication
-from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
+from calibre.ebooks.oeb.base import XHTML, XLINK
-from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
+from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
-from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
+from calibre.ebooks.oeb.base import xml2str, xpath
 from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.stylizer import Stylizer
@ -88,7 +87,7 @@ class SVGRasterizer(object):
        hrefs = self.oeb.manifest.hrefs
        for elem in xpath(svg, '//svg:*[@xl:href]'):
            href = urlnormalize(elem.attrib[XLINK('href')])
-            path, frag = urldefrag(href)
+            path = urldefrag(href)[0]
            if not path:
                continue
            abshref = item.abshref(path)