diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index f52c42811b..682c82cd1b 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -290,6 +290,7 @@ from calibre.ebooks.comic.input import ComicInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.epub.output import EPUBOutput
+from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.input import PMLInput
@@ -309,9 +310,9 @@ from calibre.devices.jetbook.driver import JETBOOK
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
- PMLOutput]
-plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \
- JETBOOK]
+ PMLOutput, MOBIOutput]
+plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
+ EB600, JETBOOK]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index 7920b823de..3a89a9b156 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
+ def preprocess_html(self, html):
+ '''
+ This method is called by the conversion pipeline on all HTML before it
+ is parsed. It is meant to be used to do any required preprocessing on
+ the HTML, like removing hard line breaks, etc. It is only invoked when
+ the ``preprocess_html`` conversion option is enabled.
+
+ The default implementation returns ``html`` unchanged; sub-classes may
+ override it.
+
+ :param html: A unicode string
+ :return: A unicode string
+ '''
+ return html
+
+
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 53b1a2065d..f07c2d86ef 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -126,9 +126,10 @@ def add_pipeline_options(parser, plumber):
'STRUCTURE DETECTION' : (
_('Control auto-detection of document structure.'),
[
- 'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
+ 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
+ 'preprocess_html',
]
),
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index d1630a25f2..7c654f924d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -131,18 +131,6 @@ OptionRecommendation(name='linearize_tables',
)
),
-OptionRecommendation(name='dont_split_on_page_breaks',
- recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Turn off splitting at page breaks. Normally, input '
- 'files are automatically split at every page break into '
- 'two files. This gives an output ebook that can be '
- 'parsed faster and with less resources. However, '
- 'splitting is slow and if your source file contains a '
- 'very large number of page breaks, you should turn off '
- 'splitting on page breaks.'
- )
- ),
-
OptionRecommendation(name='level1_toc',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('XPath expression that specifies all tags that '
@@ -312,6 +300,14 @@ OptionRecommendation(name='insert_metadata',
)
),
+OptionRecommendation(name='preprocess_html',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Attempt to detect and correct hard line breaks and other '
+ 'problems in the source file. This may make things worse, so use '
+ 'with care.'
+ )
+ ),
+
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
@@ -580,7 +576,8 @@ OptionRecommendation(name='list_recipes',
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'):
- self.oeb = create_oebbook(self.log, self.oeb, self.opts)
+ self.oeb = create_oebbook(self.log, self.oeb, self.opts,
+ self.input_plugin)
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...'))
@@ -619,20 +616,14 @@ OptionRecommendation(name='list_recipes',
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=self.opts.line_height,
- untable=self.opts.linearize_tables)
+ untable=self.output_plugin.file_type in ('mobi','lit'),
+ unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
flattener(self.oeb, self.opts)
- if self.opts.linearize_tables:
+ if self.opts.linearize_tables and \
+ self.output_plugin.file_type not in ('mobi', 'lrf'):
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
- pr(0.7)
-
- from calibre.ebooks.oeb.transforms.split import Split
- pbx = accelerators.get('pagebreaks', None)
- split = Split(not self.opts.dont_split_on_page_breaks,
- max_flow_size=self.opts.output_profile.flow_size,
- page_breaks_xpath=pbx)
- split(self.oeb, self.opts)
pr(0.9)
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
@@ -652,13 +643,14 @@ OptionRecommendation(name='list_recipes',
self.opts, self.log)
self.ui_reporter(1.)
-def create_oebbook(log, path_or_stream, opts, reader=None):
+def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor()
- oeb = OEBBook(log, html_preprocessor=html_preprocessor,
+ html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
+ opts.preprocess_html)
+ oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook
log('Parsing all content...')
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 9bfe6d4255..76fc36708e 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,16 +26,16 @@ def sanitize_head(match):
def chap_head(match):
chap = match.group('chap')
title = match.group('title')
- if not title:
+ if not title:
return '
'+chap+'
\n'
- else:
+ else:
return ''+chap+'
\n'+title+'
\n'
def wrap_lines(match):
ital = match.group('ital')
- if not ital:
+ if not ital:
return ' '
- else:
+ else:
return ital+' '
def line_length(raw, percent):
@@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
(re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'),
(re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'),
-
+
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
@@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
(re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
lambda match : '%s
'%(match.group(1),)),
]
+    def __init__(self, input_plugin_preprocess=None, plugin_preprocess=False):
+        '''
+        Set up the optional input-plugin preprocessing hook.
+
+        Both arguments are optional so that ``HTMLPreProcessor()`` keeps
+        working for callers that do not need the hook.
+
+        :param input_plugin_preprocess: Callable that takes the HTML as a
+            unicode string and returns the processed unicode string. When
+            ``None``, a pass-through is used.
+        :param plugin_preprocess: If true, ``input_plugin_preprocess`` is
+            applied to the HTML; if false, it is never called.
+        '''
+        if input_plugin_preprocess is None:
+            # A pass-through keeps "enabled but no hook supplied" harmless
+            # instead of failing later with a TypeError.
+            input_plugin_preprocess = lambda html: html
+        self.input_plugin_preprocess = input_plugin_preprocess
+        self.plugin_preprocess = plugin_preprocess
def is_baen(self, src):
return re.compile(r'(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
]
-
+
rules = self.PDFTOHTML + line_length_rules
else:
rules = []
@@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
+ if self.plugin_preprocess:
+ html = self.input_plugin_preprocess(html)
+
return html
diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py
index d5f0a9349a..aba9bff0d8 100644
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@@ -28,7 +28,21 @@ class EPUBOutput(OutputFormatPlugin):
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated EPUB file to the '
'specified directory. The contents of the directory are first '
- 'deleted, so be careful.'))
+ 'deleted, so be careful.')),
+
+ OptionRecommendation(name='dont_split_on_page_breaks',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Turn off splitting at page breaks. Normally, input '
+ 'files are automatically split at every page break into '
+ 'two files. This gives an output ebook that can be '
+ 'parsed faster and with less resources. However, '
+ 'splitting is slow and if your source file contains a '
+ 'very large number of page breaks, you should turn off '
+ 'splitting on page breaks.'
+ )
+ ),
+
+
])
@@ -88,6 +102,13 @@ class EPUBOutput(OutputFormatPlugin):
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
+ from calibre.ebooks.oeb.transforms.split import Split
+ split = Split(not self.opts.dont_split_on_page_breaks,
+ max_flow_size=self.opts.output_profile.flow_size
+ )
+ split(self.oeb, self.opts)
+
+
self.workaround_ade_quirks()
from calibre.ebooks.oeb.transforms.rescale import RescaleImages
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 252032a23d..255d975b1e 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook
- oeb = create_oebbook(log, opfpath, opts)
+ oeb = create_oebbook(log, opfpath, opts, self)
from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 2d726f7eeb..409482da29 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
accelerators):
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook
- return create_oebbook(log, stream, options, reader=LitReader)
+ return create_oebbook(log, stream, options, self, reader=LitReader)
diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 18f53317e0..a2d999ffc8 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -80,19 +80,6 @@ class MobiMLizer(object):
def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables
- @classmethod
- def config(cls, cfg):
- group = cfg.add_group('mobiml', _('Mobipocket markup options.'))
- group('ignore_tables', ['--ignore-tables'], default=False,
- help=_('Render HTML tables as blocks of text instead of actual '
- 'tables. This is neccessary if the HTML contains very '
- 'large or complex tables.'))
- return cfg
-
- @classmethod
- def generate(cls, opts):
- return cls(ignore_tables=opts.ignore_tables)
-
def __call__(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb
diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py
new file mode 100644
index 0000000000..1866888ab1
--- /dev/null
+++ b/src/calibre/ebooks/mobi/output.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre.customize.conversion import OptionRecommendation
+
+class MOBIOutput(OutputFormatPlugin):
+    '''
+    Output plugin that runs the MOBI-specific transforms on a book and
+    hands the result to ``MobiWriter``.
+    '''
+
+    name = 'MOBI Output'
+    author = 'Marshall T. Vandegrift'
+    file_type = 'mobi'
+
+    # Conversion options contributed by this plugin.
+    options = set([
+        OptionRecommendation(
+            name='rescale_images',
+            recommended_value=False,
+            help=_('Modify images to meet Palm device size limitations.')),
+        OptionRecommendation(
+            name='prefer_author_sort',
+            recommended_value=False,
+            level=OptionRecommendation.LOW,
+            help=_('When present, use author sort field as author.')),
+        OptionRecommendation(
+            name='toc_title',
+            recommended_value=None,
+            help=_('Title for any generated in-line table of contents.')),
+    ])
+
+    def convert(self, oeb, output_path, input_plugin, opts, log):
+        '''
+        Apply the MOBI transforms to ``oeb`` and write it to ``output_path``.
+
+        :param oeb: The book object passed to every transform and the writer
+        :param output_path: Destination passed to the ``MobiWriter`` call
+        :param input_plugin: Not used by this method
+        :param opts: Conversion options; ``rescale_images``, ``toc_title``,
+            ``linearize_tables`` and ``prefer_author_sort`` are read here
+        :param log: Logger, stored on the plugin instance
+        '''
+        self.log = log
+        self.opts = opts
+        self.oeb = oeb
+
+        from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter
+        from calibre.ebooks.mobi.mobiml import MobiMLizer
+        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
+        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
+        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
+
+        # None leaves the choice of image size limit to MobiWriter.
+        if opts.rescale_images:
+            imagemax = PALM_MAX_IMAGE_SIZE
+        else:
+            imagemax = None
+
+        # Each transform is called for its effect on the book; return
+        # values are ignored. The order matches the original sequence.
+        HTMLTOCAdder(title=opts.toc_title)(oeb, opts)
+        CaseMangler()(oeb, opts)
+        SVGRasterizer()(oeb, opts)
+        MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)
+
+        write_mobi = MobiWriter(imagemax=imagemax,
+                prefer_author_sort=opts.prefer_author_sort)
+        write_mobi(oeb, output_path)
+
diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py
index c521ba9977..e16deeccda 100644
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@@ -6,8 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift '
-import sys
-import os
from struct import pack
import time
import random
@@ -16,24 +14,14 @@ import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
-import logging
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
-from calibre.ebooks.oeb.base import OEBBook
-from calibre.ebooks.oeb.profile import Context
-from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
-from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
-from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
-from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
-from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
-from calibre.ebooks.mobi.mobiml import MBP_NS, MobiMLizer
-from calibre.customize.ui import run_plugins_on_postprocess
-from calibre.utils.config import Config, StringConfig
+from calibre.ebooks.mobi.mobiml import MBP_NS
# TODO:
# - Allow override CSS (?)
@@ -293,58 +281,22 @@ class Serializer(object):
buffer.write('%010d' % ioff)
-class MobiFlattener(object):
- def config(self, cfg):
- return cfg
-
- def generate(self, opts):
- return self
-
- def __call__(self, oeb, context):
- fbase = context.dest.fbase
- fkey = context.dest.fnums.values()
- flattener = CSSFlattener(
- fbase=fbase, fkey=fkey, unfloat=True, untable=True)
- return flattener(oeb, context)
-
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
- DEFAULT_PROFILE = 'CybookG3'
-
- TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer,
- ManifestTrimmer, MobiMLizer]
-
- def __init__(self, compression=None, imagemax=None,
+ def __init__(self, compression=PALMDOC, imagemax=None,
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
- @classmethod
- def config(cls, cfg):
- """Add any book-writing options to the :class:`Config` object
- :param:`cfg`.
- """
- mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.'))
- mobi('compress', ['--compress'], default=False,
- help=_('Compress file text using PalmDOC compression. '
- 'Results in smaller files, but takes a long time to run.'))
- mobi('rescale_images', ['--rescale-images'], default=False,
- help=_('Modify images to meet Palm device size limitations.'))
- mobi('prefer_author_sort', ['--prefer-author-sort'], default=False,
- help=_('When present, use the author sorting information for '
- 'generating the Mobipocket author metadata.'))
- return cfg
-
@classmethod
def generate(cls, opts):
"""Generate a Writer instance from command-line options."""
- compression = PALMDOC if opts.compress else UNCOMPRESSED
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort
- return cls(compression=compression, imagemax=imagemax,
+ return cls(compression=PALMDOC, imagemax=imagemax,
prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path):
@@ -577,88 +529,4 @@ class MobiWriter(object):
self._write(record)
-def config(defaults=None):
- desc = _('Options to control the conversion to MOBI')
- _profiles = list(sorted(Context.PROFILES.keys()))
- if defaults is None:
- c = Config('mobi', desc)
- else:
- c = StringConfig(defaults, desc)
- profiles = c.add_group('profiles', _('Device renderer profiles. '
- 'Affects conversion of font sizes, image rescaling and rasterization '
- 'of tables. Valid profiles are: %s.') % ', '.join(_profiles))
- profiles('source_profile', ['--source-profile'],
- default='Browser', choices=_profiles,
- help=_("Source renderer profile. Default is %default."))
- profiles('dest_profile', ['--dest-profile'],
- default='CybookG3', choices=_profiles,
- help=_("Destination renderer profile. Default is %default."))
- c.add_opt('encoding', ['--encoding'], default=None,
- help=_('Character encoding for HTML files. Default is to auto detect.'))
- return c
-
-
-def option_parser():
- c = config()
- parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf')
- parser.add_option(
- '-o', '--output', default=None,
- help=_('Output file. Default is derived from input filename.'))
- parser.add_option(
- '-v', '--verbose', default=0, action='count',
- help=_('Useful for debugging.'))
- return parser
-
-def oeb2mobi(opts, inpath):
- logger = Logger(logging.getLogger('oeb2mobi'))
- logger.setup_cli_handler(opts.verbose)
- outpath = opts.output
- if outpath is None:
- outpath = os.path.basename(inpath)
- outpath = os.path.splitext(outpath)[0] + '.mobi'
- source = opts.source_profile
- if source not in Context.PROFILES:
- logger.error(_('Unknown source profile %r') % source)
- return 1
- dest = opts.dest_profile
- if dest not in Context.PROFILES:
- logger.error(_('Unknown destination profile %r') % dest)
- return 1
- compression = PALMDOC if opts.compress else UNCOMPRESSED
- imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
- context = Context(source, dest)
- oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding)
- tocadder = HTMLTOCAdder(title=opts.toc_title)
- tocadder.transform(oeb, context)
- mangler = CaseMangler()
- mangler.transform(oeb, context)
- fbase = context.dest.fbase
- fkey = context.dest.fnums.values()
- flattener = CSSFlattener(
- fbase=fbase, fkey=fkey, unfloat=True, untable=True)
- flattener.transform(oeb, context)
- rasterizer = SVGRasterizer()
- rasterizer.transform(oeb, context)
- trimmer = ManifestTrimmer()
- trimmer.transform(oeb, context)
- mobimlizer = MobiMLizer(ignore_tables=opts.ignore_tables)
- mobimlizer.transform(oeb, context)
- writer = MobiWriter(compression=compression, imagemax=imagemax,
- prefer_author_sort=opts.prefer_author_sort)
- writer.dump(oeb, outpath)
- run_plugins_on_postprocess(outpath, 'mobi')
- logger.info(_('Output written to ') + outpath)
-
-def main(argv=sys.argv):
- parser = option_parser()
- opts, args = parser.parse_args(argv[1:])
- if len(args) != 1:
- parser.print_help()
- return 1
- inpath = args[0]
- retval = oeb2mobi(opts, inpath)
- return retval
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index faf2d02dc4..bbac34f0b1 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -22,8 +22,7 @@ from cssutils import CSSParser
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
-from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
- CSSPreProcessor
+from calibre.ebooks.conversion.preprocess import CSSPreProcessor
XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
@@ -1506,7 +1505,7 @@ class OEBBook(object):
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger,
- html_preprocessor=HTMLPreProcessor(),
+ html_preprocessor,
css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments: