Conversion pipeline framework is finally taking shape

2025-07-09 03:04:10 -04:00 · 2009-03-10 22:57:12 -07:00 · 2009-03-10 22:57:12 -07:00 · 741d638409
commit 741d638409
parent 9445f488c2
12 changed files with 208 additions and 62 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -117,7 +117,11 @@ class InputFormatPlugin(Plugin):
    #: instance of :class:`OptionRecommendation`.  
    options = set([])
    
-    def convert(self, stream, options, file_ext, parse_cache, log):
+    #: A set of 3-tuples of the form 
+    #: (option_name, recommended_value, recommendation_level)
+    recommendations = set([])
+    
+    def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
        the path to the created OPF file. All output should be contained in 
@ -153,10 +157,16 @@ class InputFormatPlugin(Plugin):
        
        :param log: A :class:`calibre.utils.logging.Log` object. All output 
                    should use this object.
+                    
+        :param accelerators: A dictionary of various information that the input
+                             plugin can get easily that would speed up the
+                             subsequent stages of the conversion.
+                             
        '''
        raise NotImplementedError
    
-    def __call__(self, stream, options, file_ext, parse_cache, log, output_dir):
+    def __call__(self, stream, options, file_ext, parse_cache, log, 
+                 accelerators, output_dir):
        log('InputFormatPlugin: %s running'%self.name, end=' ')
        if hasattr(stream, 'name'):
            log('on', stream.name)
@ -166,7 +176,8 @@ class InputFormatPlugin(Plugin):
                shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
                    
                    
-            ret = self.convert(stream, options, file_ext, parse_cache, log)
+            ret = self.convert(stream, options, file_ext, parse_cache, 
+                               log, accelerators)
            for key in list(parse_cache.keys()):
                if os.path.abspath(key) != key:
                    log.warn(('InputFormatPlugin: %s returned a '
@ -221,6 +232,10 @@ class OutputFormatPlugin(Plugin):
    #: instance of :class:`OptionRecommendation`.  
    options = set([])
    
+    #: A set of 3-tuples of the form 
+    #: (option_name, recommended_value, recommendation_level)
+    recommendations = set([])
+
    def convert(self, oeb_book, input_plugin, options, parse_cache, log):
        raise NotImplementedError
 
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -39,6 +39,7 @@ from optparse import OptionGroup, Option
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import Log
 from calibre.constants import preferred_encoding
+from calibre.customize.conversion import OptionRecommendation

 def print_help(parser, log):
    help = parser.format_help().encode(preferred_encoding, 'replace')
@ -84,16 +85,16 @@ def add_input_output_options(parser, plumber):
            option_recommendation_to_cli_option(group, opt)
            
    if input_options:
-        title = plumber.input_fmt.upper() + ' ' + _('OPTIONS')
+        title = _('INPUT OPTIONS')
        io = OptionGroup(parser, title, _('Options to control the processing'
-                                          ' of the input file'))
+                          ' of the input %s file')%plumber.input_fmt)
        add_options(io.add_option, input_options)
        parser.add_option_group(io)
        
    if output_options:
        title = plumber.output_fmt.upper() + ' ' + _('OPTIONS')
+        # The description names the format being written, so it must be
+        # built from output_fmt rather than input_fmt.
        oo = OptionGroup(parser, title, _('Options to control the processing'
-                                          ' of the output file'))
+                          ' of the output %s file')%plumber.output_fmt)
        add_options(oo.add_option, output_options)
        parser.add_option_group(oo)

@ -106,6 +107,9 @@ def add_pipeline_options(parser, plumber):
                     ]
                    ),
              
+              'METADATA' : (_('Options to set metadata in the output'),
+                            plumber.metadata_option_names,
+                            ),
              'DEBUG': (_('Options to help with debugging the conversion'),
                        [
                         'verbose',
@ -114,7 +118,7 @@ def add_pipeline_options(parser, plumber):
                
              }
    
-    group_order = ['', 'DEBUG']
+    group_order = ['', 'METADATA', 'DEBUG']
    
    for group in group_order:
        desc, options = groups[group]
@ -147,11 +151,16 @@ def main(args=sys.argv):
    add_pipeline_options(parser, plumber)
    
    opts = parser.parse_args(args)[0]
-    recommendations = [(n.dest, getattr(opts, n.dest)) \
-                                        for n in parser.options_iter()]
-    
+    recommendations = [(n.dest, getattr(opts, n.dest), 
+                        OptionRecommendation.HIGH) \
+                                        for n in parser.options_iter()
+                                        if n.dest]
    plumber.merge_ui_recommendations(recommendations)
    
+    plumber.run()
+    
+    log(_('Output saved to'), ' ', plumber.output)
+    
    return 0
    
 if __name__ == '__main__':
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -9,9 +9,23 @@ from calibre.customize.conversion import OptionRecommendation
 from calibre.customize.ui import input_profiles, output_profiles, \
        plugin_for_input_format, plugin_for_output_format

+class OptionValues(object):
+    # Plain attribute container: Plumber.setup_options() sets one attribute
+    # per option name on an instance of this class.
+    pass
+
 class Plumber(object):
    
-    pipeline_options = [
+    metadata_option_names = [
+        'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', 
+        'publisher', 'series', 'series_index', 'rating', 'isbn', 
+        'tags', 'book_producer', 'language'
+        ]
+    
+    def __init__(self, input, output, log):
+        self.input = input
+        self.output = output
+        self.log = log
+        
+        self.pipeline_options = [

 OptionRecommendation(name='verbose', 
            recommended_value=0, level=OptionRecommendation.LOW,
@ -40,13 +54,72 @@ OptionRecommendation(name='output_profile',
                   'will work on a device. For example EPUB on the SONY reader.'
                   )
        ),
+        
+OptionRecommendation(name='read_metadata_from_opf', 
+            recommended_value=None, level=OptionRecommendation.LOW,
+            short_switch='m', 
+            help=_('Read metadata from the specified OPF file. Metadata read '
+                   'from this file will override any metadata in the source ' 
+                   'file.')
+        ),
+        
+OptionRecommendation(name='title',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the title.')),

+OptionRecommendation(name='authors',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the authors. Multiple authors should be separated ')),
+
+OptionRecommendation(name='title_sort',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('The version of the title to be used for sorting. ')),
+
+OptionRecommendation(name='author_sort',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('String to be used when sorting by author. ')),
+
+OptionRecommendation(name='cover',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the cover to the specified file.')),
+
+OptionRecommendation(name='comments',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the ebook description.')),
+
+OptionRecommendation(name='publisher',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the ebook publisher.')),
+
+OptionRecommendation(name='series',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the series this ebook belongs to.')),
+
+OptionRecommendation(name='series_index',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the index of the book in this series.')),
+
+OptionRecommendation(name='rating',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the rating. Should be a number between 1 and 5.')),
+
+OptionRecommendation(name='isbn',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the ISBN of the book.')),
+
+OptionRecommendation(name='tags',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the tags for the book. Should be a comma separated list.')),
+
+OptionRecommendation(name='book_producer',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the book producer.')),
+
+OptionRecommendation(name='language',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Set the language.')),
 ]

-    def __init__(self, input, output, log):
-        self.input = input
-        self.output = output
-        self.log = log
        
        input_fmt = os.path.splitext(input)[1]
        if not input_fmt:
@ -85,11 +158,79 @@ OptionRecommendation(name='output_profile',
                    return rec
        
    def merge_plugin_recommendations(self):
-        pass
+        '''
+        Apply the recommendations of the input and output plugins.
+
+        Each plugin supplies ``(option_name, recommended_value, level)``
+        tuples. A recommendation is applied when the named option exists and
+        its current level is not higher than the recommendation's level.
+        The option's level is raised along with its value, so that a later,
+        weaker recommendation cannot override a stronger one.
+        '''
+        for source in (self.input_plugin, self.output_plugin):
+            for name, val, level in source.recommendations:
+                rec = self.get_option_by_name(name)
+                if rec is not None and rec.level <= level:
+                    rec.recommended_value = val
+                    # Remember how strongly this value was recommended.
+                    rec.level = level
    
    def merge_ui_recommendations(self, recommendations):
-        pass
+        '''
+        Apply recommendations coming from the user interface.
+
+        :param recommendations: An iterable of
+            ``(option_name, recommended_value, level)`` tuples.
+
+        A value is applied only when the option exists, its current level is
+        not higher than the recommendation's level and its current level is
+        below ``HIGH``.
+        '''
+        for name, val, level in recommendations:
+            rec = self.get_option_by_name(name)
+            if rec is None:
+                continue
+            # Skip options recommended more strongly than the UI value, and
+            # options whose existing recommendation is already HIGH.
+            if rec.level > level or rec.level >= rec.HIGH:
+                continue
+            rec.recommended_value = val
    
+    def read_user_metadata(self):
+        '''
+        Build the user specified metadata and store it in
+        ``self.user_metadata``.
+
+        If the ``read_metadata_from_opf`` option is set, metadata is first
+        read from that OPF file. Every option in ``metadata_option_names``
+        whose value is not None is then set on top of it. If a cover path
+        results, the contents of that file are stored in ``cover_data`` and
+        the ``cover`` path is cleared.
+        '''
+        from calibre.ebooks.metadata import MetaInformation, string_to_authors
+        from calibre.ebooks.metadata.opf2 import OPF
+        mi = MetaInformation(None, [])
+        if self.opts.read_metadata_from_opf is not None:
+            self.opts.read_metadata_from_opf = os.path.abspath(
+                                            self.opts.read_metadata_from_opf)
+            # NOTE(review): this stream is left open because it is not
+            # visible here whether OPF reads it eagerly -- confirm and close.
+            opf = OPF(open(self.opts.read_metadata_from_opf, 'rb'),
+                      os.path.dirname(self.opts.read_metadata_from_opf))
+            mi = MetaInformation(opf)
+        for x in self.metadata_option_names:
+            val = getattr(self.opts, x, None)
+            if val is not None:
+                # Options arrive as strings; convert the structured ones.
+                if x == 'authors':
+                    val = string_to_authors(val)
+                elif x == 'tags':
+                    val = [i.strip() for i in val.split(',')]
+                elif x in ('rating', 'series_index'):
+                    val = float(val)
+                setattr(mi, x, val)
+        if mi.cover:
+            # Close the cover file explicitly instead of relying on garbage
+            # collection to release the handle.
+            cover_file = open(mi.cover, 'rb')
+            try:
+                mi.cover_data = ('', cover_file.read())
+            finally:
+                cover_file.close()
+            mi.cover = None
+        self.user_metadata = mi
+            
    
+    def setup_options(self):
+        '''
+        Create ``self.opts`` from the current recommendations.
+
+        Every option in the input, pipeline and output option groups becomes
+        an attribute holding its recommended value. The ``input_profile``
+        and ``output_profile`` values are then replaced by the profile whose
+        ``short_name`` matches them (and left unchanged when none matches).
+        Finally the user specified metadata is read.
+        '''
+        opts = self.opts = OptionValues()
+        option_groups = (self.input_options, self.pipeline_options,
+                         self.output_options)
+        for recs in option_groups:
+            for rec in recs:
+                setattr(opts, rec.option.name, rec.recommended_value)
+
+        for profile in input_profiles():
+            if profile.short_name == opts.input_profile:
+                opts.input_profile = profile
+                break
+
+        for profile in output_profiles():
+            if profile.short_name == opts.output_profile:
+                opts.output_profile = profile
+                break
+
+        self.read_user_metadata()
+    
+    def run(self):
+        '''
+        Set up the options, run the preprocess plugins on the input file,
+        run the input plugin and load the path it returns into
+        ``self.oeb`` using an :class:`OEBReader`.
+        '''
+        self.setup_options()
+        from calibre.customize.ui import run_plugins_on_preprocess
+        self.input = run_plugins_on_preprocess(self.input)
+        
+        from calibre.ebooks.oeb.reader import OEBReader
+        from calibre.ebooks.oeb.base import OEBBook
+        # Both dictionaries are handed to the input plugin; parse_cache is
+        # afterwards passed on to the OEBBook.
+        parse_cache, accelerators = {}, {}
+        
+        # NOTE(review): InputFormatPlugin.__call__ is declared with an extra
+        # ``output_dir`` parameter after ``accelerators``; none is passed
+        # here -- confirm which directory should be supplied.
+        # NOTE(review): the input stream opened here is never closed.
+        opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, 
+                                    self.input_fmt, parse_cache, self.log,
+                                    accelerators)
+        
+        self.reader = OEBReader()
+        self.oeb = OEBBook(self.log, parse_cache=parse_cache) 
+        self.reader(self.oeb, opfpath)
+        
    
        
--- a/src/calibre/ebooks/epub/input.py
+++ b/src/calibre/ebooks/epub/input.py
@ -51,7 +51,8 @@ class EPUBInput(InputFormatPlugin):
            traceback.print_exc()
        return False

-    def convert(self, stream, options, file_ext, parse_cache, log):
+    def convert(self, stream, options, file_ext, parse_cache, log, 
+                accelerators):
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -12,7 +12,8 @@ class MOBIInput(InputFormatPlugin):
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = set(['mobi', 'prc', 'azw'])
    
-    def convert(self, stream, options, file_ext, parse_cache, log):
+    def convert(self, stream, options, file_ext, parse_cache, log, 
+                accelerators):
        from calibre.ebooks.mobi.reader import MobiReader
        mr = MobiReader(stream, log, options.input_encoding, 
                        options.debug_input)
@ -22,5 +23,8 @@ class MOBIInput(InputFormatPlugin):
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            open('debug-raw.html', 'wb').write(raw)
-            
+        for f, root in parse_cache.items():
+            if '.' in f:
+                accelerators[f] = {'pagebreaks':root.xpath(
+                                            '//div[@class="mbp_pagebreak"]')}
        return mr.created_opf_path
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@ -9,7 +9,6 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
 import sys
 import os
 from struct import pack
-import functools
 import time
 import random
 from cStringIO import StringIO
@ -18,11 +17,10 @@ from itertools import izip, count
 from collections import defaultdict
 from urlparse import urldefrag
 import logging
-from lxml import etree
 from PIL import Image
 from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
    OEB_RASTER_IMAGES
-from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname
+from calibre.ebooks.oeb.base import namespace, prefixname
 from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.base import OEBBook
 from calibre.ebooks.oeb.profile import Context
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'

-import os, sys, re, uuid
+import os, re, uuid
 from mimetypes import types_map
 from collections import defaultdict
 from itertools import count
@ -203,14 +203,6 @@ class OEBError(Exception):
    """Generic OEB-processing error."""
    pass

-
-class FauxLogger(object):
-    """Fake logging interface."""
-    def __getattr__(self, name):
-        return self
-    def __call__(self, message):
-        print message
-
 class NullContainer(object):
    """An empty container.

@ -1224,16 +1216,20 @@ class PageList(object):
 class OEBBook(object):
    """Representation of a book in the IDPF OEB data model."""
    
-    def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()):
+    def __init__(self, logger, parse_cache={}, encoding='utf-8', 
+                 pretty_print=False):
        """Create empty book.  Optional arguments:
        
+        :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
+            paths to the cached files and values are lxml root objects and
+            cssutils stylesheets.
        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
-        :prama:`logger`: A Logger object to use for logging all messages
+        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book.  It is accessible
-            via the instance data member :attr:`logger`.
+            via the instance data members :attr:`logger,log`.
        
        It provides the following public instance data members for
        accessing various parts of the OEB data model:
@ -1251,7 +1247,7 @@ class OEBBook(object):
        """
        self.encoding = encoding
        self.pretty_print = pretty_print
-        self.logger = logger
+        self.logger = self.log = logger
        self.version = '2.0'
        self.container = NullContainer()
        self.metadata = Metadata(self)
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -19,9 +19,9 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
 from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
    ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
-from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath
-from calibre.ebooks.oeb.base import urlnormalize, xml2str
-from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer
+from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
+                                    urlnormalize, BINARY_MIME, \
+                                    OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.metadata.epub import CoverRenderer
@ -45,9 +45,6 @@ class OEBReader(object):
    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""

-    def __init__(self):
-        return
-    
    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
@ -65,7 +62,7 @@ class OEBReader(object):
        :param:`oeb`.
        """
        self.oeb = oeb
-        self.logger = oeb.logger
+        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path)
        opf = self._read_opf()
        self._all_from_opf(opf)
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@ -6,18 +6,14 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys
-import os
 import re
 import operator
 import math
-from itertools import chain
 from collections import defaultdict
 from lxml import etree
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES
 from calibre.ebooks.oeb.base import namespace, barename
-from calibre.ebooks.oeb.base import OEBBook
 from calibre.ebooks.oeb.stylizer import Stylizer

 COLLAPSE = re.compile(r'[ \t\r\n\v]+')
--- a/src/calibre/ebooks/oeb/transforms/htmltoc.py
+++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py
@ -6,9 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys
-import os
-from lxml import etree
 from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME
 from calibre.ebooks.oeb.base import element
--- a/src/calibre/ebooks/oeb/transforms/manglecase.py
+++ b/src/calibre/ebooks/oeb/transforms/manglecase.py
@ -6,13 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys
-import os
-import re
-import operator
-import math
-from itertools import chain
-from collections import defaultdict
 from lxml import etree
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import CSS_MIME
--- a/src/calibre/ebooks/oeb/transforms/rasterize.py
+++ b/src/calibre/ebooks/oeb/transforms/rasterize.py
@ -6,7 +6,6 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys
 import os
 from urlparse import urldefrag
 import base64
@ -20,9 +19,9 @@ from PyQt4.QtGui import QImage
 from PyQt4.QtGui import QPainter
 from PyQt4.QtSvg import QSvgRenderer
 from PyQt4.QtGui import QApplication
-from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK
-from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME
-from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename
+from calibre.ebooks.oeb.base import XHTML, XLINK
+from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
+from calibre.ebooks.oeb.base import xml2str, xpath
 from calibre.ebooks.oeb.base import urlnormalize
 from calibre.ebooks.oeb.stylizer import Stylizer

@ -88,7 +87,7 @@ class SVGRasterizer(object):
        hrefs = self.oeb.manifest.hrefs
        for elem in xpath(svg, '//svg:*[@xl:href]'):
            href = urlnormalize(elem.attrib[XLINK('href')])
-            path, frag = urldefrag(href)
+            path = urldefrag(href)[0]
            if not path:
                continue
            abshref = item.abshref(path)