From 3e29dfbe5682eff555c0d9bf3126f398aacb6ec2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Apr 2009 01:01:18 -0700 Subject: [PATCH] Added LIT input plugin. Ported splitting code now works (at least on the handful of files I've tested) --- src/calibre/customize/builtins.py | 7 +- src/calibre/customize/conversion.py | 19 ++++- src/calibre/ebooks/conversion/cli.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 27 ++++--- src/calibre/ebooks/html/input.py | 11 +++ src/calibre/ebooks/lit/input.py | 24 +++++++ src/calibre/ebooks/lit/reader.py | 83 +++++++++++----------- src/calibre/ebooks/oeb/base.py | 30 ++++++-- src/calibre/ebooks/oeb/iterator.py | 54 ++++++-------- src/calibre/ebooks/oeb/output.py | 1 - src/calibre/ebooks/oeb/reader.py | 11 ++- src/calibre/ebooks/oeb/transforms/split.py | 72 ++++++++++--------- src/calibre/ebooks/oeb/writer.py | 2 +- 13 files changed, 209 insertions(+), 137 deletions(-) create mode 100644 src/calibre/ebooks/lit/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index aa6c003114..08824a3591 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -263,14 +263,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin): def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) - + class PDFMetadataWriter(MetadataWriterPlugin): name = 'Set PDF metadata' file_types = set(['pdf']) description = _('Set metadata in %s files') % 'PDF' author = 'John Schember' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.pdf import set_metadata set_metadata(stream, mi) @@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput +from calibre.ebooks.lit.input import LITInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput] + TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 77cdb0b7da..b334816adf 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -41,6 +41,11 @@ class ConversionOption(object): def __eq__(self, other): return hash(self) == hash(other) + def clone(self): + return ConversionOption(name=self.name, help=self.help, + long_switch=self.long_switch, short_switch=self.short_switch, + choices=self.choices) + class OptionRecommendation(object): LOW = 1 MED = 2 @@ -59,6 +64,10 @@ class OptionRecommendation(object): self.validate_parameters() + def clone(self): + return OptionRecommendation(recommended_value=self.recommended_value, + level=self.level, option=self.option.clone()) + def validate_parameters(self): if self.option.choices and self.recommended_value not in \ self.option.choices: @@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin): options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): os.makedirs(options.debug_input) - shutil.rmtree(options.debug_input) - shutil.copytree(output_dir, options.debug_input) + if isinstance(ret, basestring): + shutil.rmtree(options.debug_input) + shutil.copytree(output_dir, options.debug_input) + else: + from calibre.ebooks.oeb.writer import OEBWriter + w = OEBWriter(pretty_print=options.pretty_print) + w(ret, options.debug_input) + log.info('Input debug saved to:', options.debug_input) return ret diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index a3d57be191..b7336ab30a 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log): raise SystemExit(1) output = args[2] - if output.startswith('.'): + if output.startswith('.') and output != '.': output = os.path.splitext(os.path.basename(input))[0]+output output = os.path.abspath(output) @@ -171,7 +171,8 @@ def main(args=sys.argv): plumber.run() - log(_('Output saved to'), ' ', plumber.output) + if plumber.opts.debug_input is None: + log(_('Output saved to'), ' ', plumber.output) return 0 diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 93fc376bea..1edeed8d9c 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -32,8 +32,8 @@ class Plumber(object): :param input: Path to input file. :param output: Path to output file/directory ''' - self.input = input - self.output = output + self.input = os.path.abspath(input) + self.output = os.path.abspath(output) self.log = log # Initialize the conversion options that are independent of input and @@ -188,15 +188,15 @@ OptionRecommendation(name='language', ] - input_fmt = os.path.splitext(input)[1] + input_fmt = os.path.splitext(self.input)[1] if not input_fmt: raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() - if os.path.exists(output) and os.path.isdir(output): + if os.path.exists(self.output) and os.path.isdir(self.output): output_fmt = 'oeb' else: - output_fmt = os.path.splitext(output)[1] + output_fmt = os.path.splitext(self.output)[1] if not output_fmt: output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() @@ -323,6 +323,9 @@ OptionRecommendation(name='language', self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, self.log, accelerators, tdir) + if self.opts.debug_input is not None: + self.log('Debug input called, aborting the rest of the pipeline.') + return if not hasattr(self.oeb, 'manifest'): self.oeb = create_oebbook(self.log, self.oeb, self.opts) @@ -365,18 +368,20 @@ OptionRecommendation(name='language', self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) -def create_oebbook(log, opfpath, opts): +def create_oebbook(log, path_or_stream, opts, reader=None): ''' - Create an OEBBook from an OPF file. + Create an OEBBook. ''' - from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor() - reader = OEBReader() oeb = OEBBook(log, html_preprocessor=html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook - log.info('Parsing all content...') - reader(oeb, opfpath) + log('Parsing all content...') + if reader is None: + from calibre.ebooks.oeb.reader import OEBReader + reader = OEBReader + + reader()(oeb, path_or_stream) return oeb diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 5b9a085b1d..252032a23d 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin): ) ), + OptionRecommendation(name='dont_package', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally this input plugin re-arranges all the input ' + 'files into a standard folder hierarchy. Only use this option ' + 'if you know what you are doing as it can result in various ' + 'nasty side effects in the rest of of the conversion pipeline.' + ) + ), ]) def convert(self, stream, opts, file_ext, log, @@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin): mi.render(open('metadata.opf', 'wb')) opfpath = os.path.abspath('metadata.opf') + if opts.dont_package: + return opfpath + from calibre.ebooks.conversion.plumber import create_oebbook oeb = create_oebbook(log, opfpath, opts) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py new file mode 100644 index 0000000000..2d726f7eeb --- /dev/null +++ b/src/calibre/ebooks/lit/input.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize.conversion import InputFormatPlugin + +class LITInput(InputFormatPlugin): + + name = 'LIT Input' + author = 'Marshall T. Vandegrift' + description = 'Convert LIT files to HTML' + file_types = set(['lit']) + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.ebooks.lit.reader import LitReader + from calibre.ebooks.conversion.plumber import create_oebbook + return create_oebbook(log, stream, options, reader=LitReader) + + diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index f32a65e010..79249fe7c3 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,13 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, os +import struct, os import functools import re from urlparse import urldefrag from cStringIO import StringIO from urllib import unquote as urlunquote -from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 @@ -29,12 +28,12 @@ __all__ = ["LitReader"] XML_DECL = """ """ OPF_DECL = """ - """ HTML_DECL = """ - """ @@ -73,7 +72,7 @@ def encint(bytes, remaining): val <<= 7 val |= (b & 0x7f) if b & 0x80 == 0: break - return val, bytes[pos:], remaining + return val, bytes[pos:], remaining def msguid(bytes): values = struct.unpack(">(?=>>|[^>])') DOUBLE_ANGLE_RE = re.compile(r'([<>])\1') EMPTY_ATOMS = ({},{}) - + def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map @@ -143,7 +142,7 @@ class UnBinary(object): raw = self.CLOSE_ANGLE_RE.sub(r'>', raw) raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) self.raw = raw - + def item_path(self, internal_id): try: target = self.manifest[internal_id].path @@ -159,7 +158,7 @@ class UnBinary(object): index += 1 relpath = (['..'] * (len(base) - index)) + target[index:] return '/'.join(relpath) - + def __unicode__(self): return self.raw.decode('utf-8') @@ -172,11 +171,11 @@ class UnBinary(object): in_censorship = is_goingdown = False state = 'text' flags = 0 - + while index < len(bin): c, index = read_utf8_char(bin, index) oc = ord(c) - + if state == 'text': if oc == 0: state = 'get flags' @@ -188,14 +187,14 @@ class UnBinary(object): elif c == '<': c = '<<' buf.write(encode(c)) - + elif state == 'get flags': if oc == 0: state = 'text' continue flags = oc state = 'get tag' - + elif state == 'get tag': state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: @@ -226,7 +225,7 @@ class UnBinary(object): if depth == 0: raise LitError('Extra closing tag') return index - + elif state == 'get attr': in_censorship = False if oc == 0: @@ -265,7 +264,7 @@ class UnBinary(object): state = 'get href length' else: state = 'get value length' - + elif state == 'get value length': if not in_censorship: buf.write('"') @@ -281,7 +280,7 @@ class UnBinary(object): continue if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - + elif state == 'get value': if count == 0xfffe: if not in_censorship: @@ -301,7 +300,7 @@ class UnBinary(object): buf.write('"') in_censorship = False state = 'get attr' - + elif state == 'get custom length': count = oc - 1 if count <= 0 or count > len(bin)-index: @@ -309,21 +308,21 @@ class UnBinary(object): dynamic_tag += 1 state = 'get custom' tag_name = '' - + elif state == 'get custom': tag_name += c count -= 1 if count == 0: buf.write(encode(tag_name)) state = 'get attr' - + elif state == 'get attr length': count = oc - 1 if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) buf.write(' ') state = 'get custom attr' - + elif state == 'get custom attr': buf.write(encode(c)) count -= 1 @@ -337,7 +336,7 @@ class UnBinary(object): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' - + elif state == 'get href': href += c count -= 1 @@ -350,7 +349,7 @@ class UnBinary(object): buf.write(encode(u'"%s"' % path)) state = 'get attr' return index - + class DirectoryEntry(object): def __init__(self, name, section, offset, size): @@ -358,11 +357,11 @@ class DirectoryEntry(object): self.section = section self.offset = offset self.size = size - + def __repr__(self): return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \ % (repr(self.name), self.section, self.offset, self.size) - + def __str__(self): return repr(self) @@ -382,12 +381,12 @@ class ManifestItem(object): path = os.path.normpath(path).replace('\\', '/') while path.startswith('../'): path = path[3:] self.path = path - + def __eq__(self, other): if hasattr(other, 'internal'): return self.internal == other.internal return self.internal == other - + def __repr__(self): return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \ "offset=%d, root=%r, state=%r)" \ @@ -404,7 +403,7 @@ def preserve(function): self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper - + class LitFile(object): PIECE_SIZE = 16 @@ -438,14 +437,14 @@ class LitFile(object): return self.stream.read(8) return property(fget=fget) magic = magic() - + def version(): def fget(self): self.stream.seek(8) return u32(self.stream.read(4)) return property(fget=fget) version = version() - + def hdr_len(): @preserve def fget(self): @@ -453,7 +452,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() - + def num_pieces(): @preserve def fget(self): @@ -461,7 +460,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() - + def sec_hdr_len(): @preserve def fget(self): @@ -469,7 +468,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() - + def guid(): @preserve def fget(self): @@ -477,7 +476,7 @@ class LitFile(object): return self.stream.read(16) return property(fget=fget) guid = guid() - + def header(): @preserve def fget(self): @@ -488,7 +487,7 @@ class LitFile(object): return self.stream.read(size) return property(fget=fget) header = header() - + @preserve def __len__(self): self.stream.seek(0, 2) @@ -501,7 +500,7 @@ class LitFile(object): def read_content(self, offset, size): return self.read_raw(self.content_offset + offset, size) - + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) bytes = self.read_raw(offset, self.sec_hdr_len) @@ -526,12 +525,12 @@ class LitFile(object): if u32(bytes[offset+4+16:]): raise LitError('This file has a 64bit content offset') self.content_offset = u32(bytes[offset+16:]) - self.timestamp = u32(bytes[offset+24:]) + self.timestamp = u32(bytes[offset+24:]) self.language_id = u32(bytes[offset+28:]) offset += 48 if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): @@ -556,7 +555,7 @@ class LitFile(object): self.piece3_guid = piece elif i == 4: self.piece4_guid = piece - + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') @@ -760,9 +759,9 @@ class LitFile(object): raise LitError("Reset table is too short") if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: raise LitError("Reset table has 64bit value for UCLENGTH") - + result = [] - + window_size = 14 u = u32(control[CONTROL_WINDOW_SIZE:]) while u > 0: @@ -847,13 +846,13 @@ class LitContainer(object): def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) - + def namelist(self): return self._litfile.paths.keys() def exists(self, name): return urlunquote(name) in self._litfile.paths - + def read(self, name): entry = self._litfile.paths[urlunquote(name)] if name else None if entry is None: @@ -869,7 +868,7 @@ class LitContainer(object): internal = '/'.join(('/data', entry.internal)) content = self._litfile.get_file(internal) return content - + def _read_meta(self): path = 'content.opf' raw = self._litfile.get_file('/meta') diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 5d2c51c4ba..dda36a7500 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -272,11 +272,7 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def _prepare_xml_for_serialization(root): - pass - def xml2str(root, pretty_print=False, strip_comments=False): - _prepare_xml_for_serialization(root) ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) @@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False): def xml2unicode(root, pretty_print=False): - _prepare_xml_for_serialization(root) return etree.tostring(root, pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) @@ -321,6 +316,25 @@ def urlnormalize(href): parts = (urlquote(part) for part in parts) return urlunparse(parts) +class DummyHandler(logging.Handler): + + def __init__(self): + logging.Handler.__init__(self, logging.WARNING) + self.setFormatter(logging.Formatter('%(message)s')) + self.log = None + + def emit(self, record): + if self.log is not None: + msg = self.format(record) + f = self.log.error if record.levelno >= logging.ERROR \ + else self.log.warn + f(msg) + + +_css_logger = logging.getLogger('calibre.css') +_css_logger.setLevel(logging.WARNING) +_css_log_handler = DummyHandler() +_css_logger.addHandler(_css_log_handler) class OEBError(Exception): """Generic OEB-processing error.""" @@ -778,7 +792,8 @@ class Manifest(object): data = self.oeb.css_preprocessor(data) data = XHTML_CSS_NAMESPACE + data parser = CSSParser(loglevel=logging.WARNING, - fetcher=self._fetch_css) + fetcher=self._fetch_css, + log=_css_logger) data = parser.parseString(data, href=self.href) data.namespaces['h'] = XHTML_NS return data @@ -1435,7 +1450,7 @@ class OEBBook(object): :attr:`pages`: List of "pages," such as indexed to a print edition of the same text. """ - + _css_log_handler.log = logger self.encoding = encoding self.html_preprocessor = html_preprocessor self.css_preprocessor = css_preprocessor @@ -1450,6 +1465,7 @@ class OEBBook(object): self.guide = Guide(self) self.toc = TOC() self.pages = PageList() + self.auto_generated_toc = True @classmethod def generate(cls, opts): diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index df4f3b88f1..81e1f89029 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase from calibre.customize.ui import available_input_formats from calibre.ebooks.epub.from_html import TITLEPAGE -from calibre.ebooks.metadata.opf2 import OPF, OPFCreator +from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig from calibre.utils.logging import Log -from calibre import CurrentDir def character_count(html): ''' @@ -57,31 +56,21 @@ class FakeOpts(object): max_levels = 5 input_encoding = None -def html2opf(path, tdir, log): - from calibre.ebooks.html.input import get_filelist - from calibre.ebooks.metadata.meta import get_metadata - with CurrentDir(tdir): - fl = get_filelist(path, tdir, FakeOpts(), log) - mi = get_metadata(open(path, 'rb'), 'html') - mi = OPFCreator(os.getcwdu(), mi) - mi.guide = None - entries = [(f.path, 'application/xhtml+xml') for f in fl] - mi.create_manifest(entries) - mi.create_spine([f.path for f in fl]) - - mi.render(open('metadata.opf', 'wb')) - opfpath = os.path.abspath('metadata.opf') - - return opfpath - -def opf2opf(path, tdir, opts): - return path - def is_supported(path): ext = os.path.splitext(path)[1].replace('.', '').lower() ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) return ext in available_input_formats() + +def write_oebbook(oeb, path): + from calibre.ebooks.oeb.writer import OEBWriter + from calibre import walk + w = OEBWriter() + w(oeb, path) + for f in walk(path): + if f.endswith('.opf'): + return f + class EbookIterator(object): CHARACTERS_PER_PAGE = 1000 @@ -131,17 +120,16 @@ class EbookIterator(object): def __enter__(self): self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() - if self.ebook_ext == 'opf': - self.pathtoopf = self.pathtoebook - elif self.ebook_ext == 'html': - self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log) - else: - from calibre.ebooks.conversion.plumber import Plumber - plumber = Plumber(self.pathtoebook, self.base, self.log) - plumber.setup_options() - self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), - plumber.opts, plumber.input_fmt, self.log, - {}, self.base) + from calibre.ebooks.conversion.plumber import Plumber + plumber = Plumber(self.pathtoebook, self.base, self.log) + plumber.setup_options() + if hasattr(plumber.opts, 'dont_package'): + plumber.opts.dont_package = True + self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), + plumber.opts, plumber.input_fmt, self.log, + {}, self.base) + if hasattr(self.pathtoopf, 'manifest'): + self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir) self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 480ca3776e..ba62897215 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'oeb' - def convert(self, oeb_book, output_path, input_plugin, opts, log): self.log, self.opts = log, opts if not os.path.exists(output_path): diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index faeff4b825..6f0ff44bc9 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -349,6 +349,7 @@ class OEBReader(object): def _toc_from_ncx(self, item): if item is None: return False + self.log.debug('Reading TOC from NCX...') ncx = item.data title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) @@ -364,6 +365,7 @@ class OEBReader(object): result = xpath(opf, 'o2:tours/o2:tour') if not result: return False + self.log.debug('Reading TOC from tour...') tour = result[0] toc = self.oeb.toc toc.title = tour.get('title') @@ -384,6 +386,7 @@ class OEBReader(object): def _toc_from_html(self, opf): if 'toc' not in self.oeb.guide: return False + self.log.debug('Reading TOC from HTML...') itempath, frag = urldefrag(self.oeb.guide['toc'].href) item = self.oeb.manifest.hrefs[itempath] html = item.data @@ -414,6 +417,7 @@ class OEBReader(object): return True def _toc_from_spine(self, opf): + self.log.warn('Generating default TOC from spine...') toc = self.oeb.toc titles = [] headers = [] @@ -441,11 +445,14 @@ class OEBReader(object): return True def _toc_from_opf(self, opf, item): + self.oeb.auto_generated_toc = False if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') + # Prefer HTML to tour based TOC, since several LIT files + # have good HTML TOCs but bad tour based TOCs if self._toc_from_html(opf): return + if self._toc_from_tour(opf): return self._toc_from_spine(opf) + self.oeb.auto_generated_toc = True def _pages_from_ncx(self, opf, item): if item is None: diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 1bb5b50d06..33ab14b73d 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -51,8 +51,8 @@ class Split(object): self.log = oeb.log self.map = {} self.page_break_selectors = None - for item in self.oeb.manifest.items: - if etree.iselement(item.data): + for item in list(self.oeb.manifest.items): + if item.spine_position is not None and etree.iselement(item.data): self.split_item(item) self.fix_links() @@ -74,31 +74,34 @@ class Split(object): self.page_break_selectors = set([]) stylesheets = [x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES] - page_break_selectors = set([]) - for rule in rules(stylesheets): - before = getattr(rule.style.getPropertyCSSValue( - 'page-break-before'), 'cssText', '').strip().lower() - after = getattr(rule.style.getPropertyCSSValue( - 'page-break-after'), 'cssText', '').strip().lower() - try: - if before and before != 'avoid': - page_break_selectors.add((CSSSelector(rule.selectorText), - True)) - except: - pass - try: - if after and after != 'avoid': - page_break_selectors.add((CSSSelector(rule.selectorText), - False)) - except: - pass + for rule in rules(stylesheets): + before = getattr(rule.style.getPropertyCSSValue( + 'page-break-before'), 'cssText', '').strip().lower() + after = getattr(rule.style.getPropertyCSSValue( + 'page-break-after'), 'cssText', '').strip().lower() + try: + if before and before != 'avoid': + self.page_break_selectors.add((CSSSelector(rule.selectorText), + True)) + except: + pass + try: + if after and after != 'avoid': + self.page_break_selectors.add((CSSSelector(rule.selectorText), + False)) + except: + pass page_breaks = set([]) - for selector, before in page_break_selectors: - for elem in selector(item.data): - if before: - elem.set('pb_before', '1') - page_breaks.add(elem) + for selector, before in self.page_break_selectors: + body = item.data.xpath('//h:body', namespaces=NAMESPACES) + if not body: + continue + for elem in selector(body[0]): + if elem not in body: + if before: + elem.set('pb_before', '1') + page_breaks.add(elem) for i, elem in enumerate(item.data.iter()): elem.set('pb_order', str(i)) @@ -136,8 +139,10 @@ class Split(object): if href in self.map: anchor_map = self.map[href] nhref = anchor_map[frag if frag else None] + nhref = self.current_item.relhref(nhref) if frag: - nhref = '#'.join(href, frag) + nhref = '#'.join((nhref, frag)) + return nhref return url @@ -153,7 +158,7 @@ class FlowSplitter(object): self.page_breaks = page_breaks self.page_break_ids = page_break_ids self.max_flow_size = max_flow_size - self.base = item.abshref(item.href) + self.base = item.href base, ext = os.path.splitext(self.base) self.base = base.replace('%', '%%')+'_split_%d'+ext @@ -192,9 +197,9 @@ class FlowSplitter(object): self.trees = [] tree = orig_tree for pattern, before in ordered_ids: - self.log.debug('\t\tSplitting on page-break') elem = pattern(tree) if elem: + self.log.debug('\t\tSplitting on page-break') before, after = self.do_split(tree, elem[0], before) self.trees.append(before) tree = after @@ -414,13 +419,14 @@ class FlowSplitter(object): elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_POINT_ATTR, '0') - spine_pos = self.item.spine_pos - for current, tree in zip(map(reversed, (self.files, self.trees))): + spine_pos = self.item.spine_position + for current, tree in zip(*map(reversed, (self.files, self.trees))): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): href = a.get('href').strip() if href.startswith('#'): anchor = href[1:] file = self.anchor_map[anchor] + file = self.item.relhref(file) if file != current: a.set('href', file+href) @@ -430,12 +436,12 @@ class FlowSplitter(object): self.oeb.spine.insert(spine_pos, new_item, self.item.linear) if self.oeb.guide: - for ref in self.oeb.guide: + for ref in self.oeb.guide.values(): href, frag = urldefrag(ref.href) if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: - nhref = '#'.join(nhref, frag) + nhref = '#'.join((nhref, frag)) ref.href = nhref def fix_toc_entry(toc): @@ -444,7 +450,7 @@ class FlowSplitter(object): if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: - nhref = '#'.join(nhref, frag) + nhref = '#'.join((nhref, frag)) toc.href = nhref for x in toc: fix_toc_entry(x) diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index ef72414f5a..f71eb88ea5 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -49,7 +49,7 @@ class OEBWriter(object): def __call__(self, oeb, path): """ - Read the book in the :class:`OEBBook` object :param:`oeb` to a file + Write the book in the :class:`OEBBook` object :param:`oeb` to a folder at :param:`path`. """ version = int(self.version[0])