Added LIT input plugin. Ported splitting code now works (at least on the handful of files I've tested)

2025-08-30 23:00:21 -04:00 · 2009-04-18 01:01:18 -07:00 · 2009-04-18 01:01:18 -07:00 · 3e29dfbe56
commit 3e29dfbe56
parent b9f80aa229
13 changed files with 209 additions and 137 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
 from calibre.ebooks.mobi.input import MOBIInput
 from calibre.ebooks.pdf.input import PDFInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lit.input import LITInput
 from calibre.ebooks.html.input import HTMLInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.txt.output import TXTOutput
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
 from calibre.customize.profiles import input_profiles, output_profiles
 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
-        TXTInput, OEBOutput, TXTOutput, PDFOutput]
+        TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
                                        x.__name__.endswith('MetadataReader')]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -41,6 +41,11 @@ class ConversionOption(object):
    def __eq__(self, other):
        return hash(self) == hash(other)
    def clone(self):
        return ConversionOption(name=self.name, help=self.help,
                long_switch=self.long_switch, short_switch=self.short_switch,
                choices=self.choices)
 class OptionRecommendation(object):
    LOW  = 1
    MED  = 2
@ -59,6 +64,10 @@ class OptionRecommendation(object):
        self.validate_parameters()
    def clone(self):
        return OptionRecommendation(recommended_value=self.recommended_value,
                level=self.level, option=self.option.clone())
    def validate_parameters(self):
        if self.option.choices and self.recommended_value not in \
                                                    self.option.choices:
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
            options.debug_input = os.path.abspath(options.debug_input)
            if not os.path.exists(options.debug_input):
                os.makedirs(options.debug_input)
            if isinstance(ret, basestring):
                shutil.rmtree(options.debug_input)
                shutil.copytree(output_dir, options.debug_input)
            else:
                from calibre.ebooks.oeb.writer import OEBWriter
                w = OEBWriter(pretty_print=options.pretty_print)
                w(ret, options.debug_input)
            log.info('Input debug saved to:', options.debug_input)
        return ret
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
        raise SystemExit(1)
    output = args[2]
-    if output.startswith('.'):
+    if output.startswith('.') and output != '.':
        output = os.path.splitext(os.path.basename(input))[0]+output
    output = os.path.abspath(output)
@ -171,6 +171,7 @@ def main(args=sys.argv):
    plumber.run()
    if plumber.opts.debug_input is None:
        log(_('Output saved to'), ' ', plumber.output)
    return 0
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -32,8 +32,8 @@ class Plumber(object):
        :param input: Path to input file.
        :param output: Path to output file/directory
        '''
-        self.input = input
+        self.input = os.path.abspath(input)
-        self.output = output
+        self.output = os.path.abspath(output)
        self.log = log
        # Initialize the conversion options that are independent of input and
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
 ]
-        input_fmt = os.path.splitext(input)[1]
+        input_fmt = os.path.splitext(self.input)[1]
        if not input_fmt:
            raise ValueError('Input file must have an extension')
        input_fmt = input_fmt[1:].lower()
-        if os.path.exists(output) and os.path.isdir(output):
+        if os.path.exists(self.output) and os.path.isdir(self.output):
            output_fmt = 'oeb'
        else:
-            output_fmt = os.path.splitext(output)[1]
+            output_fmt = os.path.splitext(self.output)[1]
            if not output_fmt:
                output_fmt = '.oeb'
            output_fmt = output_fmt[1:].lower()
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
        self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
                                    self.input_fmt, self.log,
                                    accelerators, tdir)
        if self.opts.debug_input is not None:
            self.log('Debug input called, aborting the rest of the pipeline.')
            return
        if not hasattr(self.oeb, 'manifest'):
            self.oeb = create_oebbook(self.log, self.oeb, self.opts)
@ -365,18 +368,20 @@ OptionRecommendation(name='language',
        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                self.opts, self.log)
-def create_oebbook(log, opfpath, opts):
+def create_oebbook(log, path_or_stream, opts, reader=None):
    '''
-    Create an OEBBook from an OPF file.
+    Create an OEBBook.
    '''
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    html_preprocessor = HTMLPreProcessor()
    reader = OEBReader()
    oeb = OEBBook(log, html_preprocessor=html_preprocessor,
            pretty_print=opts.pretty_print)
    # Read OEB Book into OEBBook
-    log.info('Parsing all content...')
+    log('Parsing all content...')
-    reader(oeb, opfpath)
+    if reader is None:
        from calibre.ebooks.oeb.reader import OEBReader
        reader = OEBReader
    reader()(oeb, path_or_stream)
    return oeb
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
                   )
        ),
        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of of the conversion pipeline.'
                )
        ),
    ])
    def convert(self, stream, opts, file_ext, log,
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
            mi.render(open('metadata.opf', 'wb'))
            opfpath = os.path.abspath('metadata.opf')
        if opts.dont_package:
            return opfpath
        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, opfpath, opts)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -0,0 +1,24 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from calibre.customize.conversion import InputFormatPlugin
 class LITInput(InputFormatPlugin):
    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream, options, reader=LitReader)
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@ -7,13 +7,12 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
    'and Marshall T. Vandegrift <llasram@gmail.com>'
-import sys, struct, os
+import struct, os
 import functools
 import re
 from urlparse import urldefrag
 from cStringIO import StringIO
 from urllib import unquote as urlunquote
 from lxml import etree
 from calibre.ebooks.lit import LitError
 from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
 import calibre.ebooks.lit.mssha1 as mssha1
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -272,11 +272,7 @@ def XPath(expr):
 def xpath(elem, expr):
    return elem.xpath(expr, namespaces=XPNSMAP)
 def _prepare_xml_for_serialization(root):
    pass
 def xml2str(root, pretty_print=False, strip_comments=False):
    _prepare_xml_for_serialization(root)
    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                          pretty_print=pretty_print)
@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):
 def xml2unicode(root, pretty_print=False):
    _prepare_xml_for_serialization(root)
    return etree.tostring(root, pretty_print=pretty_print)
 ASCII_CHARS   = set(chr(x) for x in xrange(128))
@ -321,6 +316,25 @@ def urlnormalize(href):
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)
 class DummyHandler(logging.Handler):
    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None
    def emit(self, record):
        if self.log is not None:
            msg = self.format(record)
            f = self.log.error if record.levelno >= logging.ERROR \
                    else self.log.warn
            f(msg)
 _css_logger = logging.getLogger('calibre.css')
 _css_logger.setLevel(logging.WARNING)
 _css_log_handler = DummyHandler()
 _css_logger.addHandler(_css_log_handler)
 class OEBError(Exception):
    """Generic OEB-processing error."""
@ -778,7 +792,8 @@ class Manifest(object):
            data = self.oeb.css_preprocessor(data)
            data = XHTML_CSS_NAMESPACE + data
            parser = CSSParser(loglevel=logging.WARNING,
-                               fetcher=self._fetch_css)
+                               fetcher=self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href)
            data.namespaces['h'] = XHTML_NS
            return data
@ -1435,7 +1450,7 @@ class OEBBook(object):
        :attr:`pages`: List of "pages," such as indexed to a print edition of
            the same text.
        """
-
+        _css_log_handler.log = logger
        self.encoding = encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
@ -1450,6 +1465,7 @@ class OEBBook(object):
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
        self.auto_generated_toc = True
    @classmethod
    def generate(cls, opts):
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase
 from calibre.customize.ui import available_input_formats
 from calibre.ebooks.epub.from_html import TITLEPAGE
-from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
+from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.zipfile import safe_replace, ZipFile
 from calibre.utils.config import DynamicConfig
 from calibre.utils.logging import Log
 from calibre import CurrentDir
 def character_count(html):
    '''
@ -57,31 +56,21 @@ class FakeOpts(object):
    max_levels = 5
    input_encoding = None
 def html2opf(path, tdir, log):
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata.meta import get_metadata
    with CurrentDir(tdir):
        fl = get_filelist(path, tdir, FakeOpts(), log)
        mi = get_metadata(open(path, 'rb'), 'html')
        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in fl]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in fl])
        mi.render(open('metadata.opf', 'wb'))
        opfpath = os.path.abspath('metadata.opf')
    return opfpath
 def opf2opf(path, tdir, opts):
    return path
 def is_supported(path):
    ext = os.path.splitext(path)[1].replace('.', '').lower()
    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
    return ext in available_input_formats()
 def write_oebbook(oeb, path):
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    w = OEBWriter()
    w(oeb, path)
    for f in walk(path):
        if f.endswith('.opf'):
            return f
 class EbookIterator(object):
    CHARACTERS_PER_PAGE = 1000
@ -131,17 +120,16 @@ class EbookIterator(object):
    def __enter__(self):
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
        if self.ebook_ext == 'opf':
            self.pathtoopf = self.pathtoebook
        elif self.ebook_ext == 'html':
            self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log)
        else:
        from calibre.ebooks.conversion.plumber import Plumber
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if hasattr(plumber.opts, 'dont_package'):
            plumber.opts.dont_package = True
        self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
    author = 'Kovid Goyal'
    file_type = 'oeb'
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -349,6 +349,7 @@ class OEBReader(object):
    def _toc_from_ncx(self, item):
        if item is None:
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
@ -364,6 +365,7 @@ class OEBReader(object):
        result = xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
@ -384,6 +386,7 @@ class OEBReader(object):
    def _toc_from_html(self, opf):
        if 'toc' not in self.oeb.guide:
            return False
        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
@ -414,6 +417,7 @@ class OEBReader(object):
        return True
    def _toc_from_spine(self, opf):
        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        titles = []
        headers = []
@ -441,11 +445,14 @@ class OEBReader(object):
        return True
    def _toc_from_opf(self, opf, item):
        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item): return
-        if self._toc_from_tour(opf): return
+        # Prefer HTML to tour based TOC, since several LIT files
-        self.logger.warn('No metadata table of contents found')
+        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf): return
        if self._toc_from_tour(opf): return
        self._toc_from_spine(opf)
        self.oeb.auto_generated_toc = True
    def _pages_from_ncx(self, opf, item):
        if item is None:
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -51,8 +51,8 @@ class Split(object):
        self.log = oeb.log
        self.map = {}
        self.page_break_selectors = None
-        for item in self.oeb.manifest.items:
+        for item in list(self.oeb.manifest.items):
-            if etree.iselement(item.data):
+            if item.spine_position is not None and etree.iselement(item.data):
                self.split_item(item)
        self.fix_links()
@ -74,7 +74,6 @@ class Split(object):
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
        page_break_selectors = set([])
            for rule in rules(stylesheets):
                before = getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower()
@ -82,20 +81,24 @@ class Split(object):
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
                            True))
                except:
                    pass
                try:
                    if after and after != 'avoid':
-                    page_break_selectors.add((CSSSelector(rule.selectorText),
+                        self.page_break_selectors.add((CSSSelector(rule.selectorText),
                            False))
                except:
                    pass
        page_breaks = set([])
-        for selector, before in page_break_selectors:
+        for selector, before in self.page_break_selectors:
-            for elem in selector(item.data):
+            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
            if not body:
                continue
            for elem in selector(body[0]):
                if elem not in body:
                    if before:
                        elem.set('pb_before', '1')
                    page_breaks.add(elem)
@ -136,8 +139,10 @@ class Split(object):
        if href in self.map:
            anchor_map = self.map[href]
            nhref = anchor_map[frag if frag else None]
            nhref = self.current_item.relhref(nhref)
            if frag:
-                nhref = '#'.join(href, frag)
+                nhref = '#'.join((nhref, frag))
            return nhref
        return url
@ -153,7 +158,7 @@ class FlowSplitter(object):
        self.page_breaks    = page_breaks
        self.page_break_ids = page_break_ids
        self.max_flow_size  = max_flow_size
-        self.base           = item.abshref(item.href)
+        self.base           = item.href
        base, ext = os.path.splitext(self.base)
        self.base = base.replace('%', '%%')+'_split_%d'+ext
@ -192,9 +197,9 @@ class FlowSplitter(object):
        self.trees = []
        tree = orig_tree
        for pattern, before in ordered_ids:
            self.log.debug('\t\tSplitting on page-break')
            elem = pattern(tree)
            if elem:
                self.log.debug('\t\tSplitting on page-break')
                before, after = self.do_split(tree, elem[0], before)
                self.trees.append(before)
                tree = after
@ -414,13 +419,14 @@ class FlowSplitter(object):
                elem.attrib.pop(SPLIT_ATTR, None)
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')
-        spine_pos = self.item.spine_pos
+        spine_pos = self.item.spine_position
-        for current, tree in zip(map(reversed, (self.files, self.trees))):
+        for current, tree in zip(*map(reversed, (self.files, self.trees))):
            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    file = self.item.relhref(file)
                    if file != current:
                        a.set('href', file+href)
@ -430,12 +436,12 @@ class FlowSplitter(object):
            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
        if self.oeb.guide:
-            for ref in self.oeb.guide:
+            for ref in self.oeb.guide.values():
                href, frag = urldefrag(ref.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
-                        nhref = '#'.join(nhref, frag)
+                        nhref = '#'.join((nhref, frag))
                    ref.href = nhref
        def fix_toc_entry(toc):
@ -444,7 +450,7 @@ class FlowSplitter(object):
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
-                        nhref = '#'.join(nhref, frag)
+                        nhref = '#'.join((nhref, frag))
                    toc.href = nhref
            for x in toc:
                fix_toc_entry(x)
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@ -49,7 +49,7 @@ class OEBWriter(object):
    def __call__(self, oeb, path):
        """
-        Read the book in the :class:`OEBBook` object :param:`oeb` to a file
+        Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
        at :param:`path`.
        """
        version = int(self.version[0])