diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index f52c42811b..682c82cd1b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -290,6 +290,7 @@ from calibre.ebooks.comic.input import ComicInput from calibre.web.feeds.input import RecipeInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput +from calibre.ebooks.mobi.output import MOBIOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.input import PMLInput @@ -309,9 +310,9 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput] -plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ - JETBOOK] + PMLOutput, MOBIOutput] +plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, + EB600, JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 7920b823de..3a89a9b156 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() + def preprocess_html(self, html): + ''' + This method is called by the conversion pipeline on all HTML before it + is parsed. It is meant to be used to do any required preprocessing on + the HTML, like removing hard line breaks, etc. + + :param html: A unicode string + :return: A unicode string + ''' + return html + + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 53b1a2065d..f07c2d86ef 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -126,9 +126,10 @@ def add_pipeline_options(parser, plumber): 'STRUCTURE DETECTION' : ( _('Control auto-detection of document structure.'), [ - 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', + 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'preprocess_html', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index d1630a25f2..7c654f924d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -131,18 +131,6 @@ OptionRecommendation(name='linearize_tables', ) ), -OptionRecommendation(name='dont_split_on_page_breaks', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Turn off splitting at page breaks. Normally, input ' - 'files are automatically split at every page break into ' - 'two files. This gives an output ebook that can be ' - 'parsed faster and with less resources. However, ' - 'splitting is slow and if your source file contains a ' - 'very large number of page breaks, you should turn off ' - 'splitting on page breaks.' - ) - ), - OptionRecommendation(name='level1_toc', recommended_value=None, level=OptionRecommendation.LOW, help=_('XPath expression that specifies all tags that ' @@ -312,6 +300,14 @@ OptionRecommendation(name='insert_metadata', ) ), +OptionRecommendation(name='preprocess_html', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Attempt to detect and correct hard line breaks and other ' + 'problems in the source file. This may make things worse, so use ' + 'with care.' + ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -580,7 +576,8 @@ OptionRecommendation(name='list_recipes', self.log('Debug input called, aborting the rest of the pipeline.') return if not hasattr(self.oeb, 'manifest'): - self.oeb = create_oebbook(self.log, self.oeb, self.opts) + self.oeb = create_oebbook(self.log, self.oeb, self.opts, + self.input_plugin) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr(0., _('Running transforms on ebook...')) @@ -619,20 +616,14 @@ OptionRecommendation(name='list_recipes', flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=self.opts.line_height, - untable=self.opts.linearize_tables) + untable=self.output_plugin.file_type in ('mobi','lit'), + unfloat=self.output_plugin.file_type in ('mobi', 'lit')) flattener(self.oeb, self.opts) - if self.opts.linearize_tables: + if self.opts.linearize_tables and \ + self.output_plugin.file_type not in ('mobi', 'lrf'): from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables LinearizeTables()(self.oeb, self.opts) - pr(0.7) - - from calibre.ebooks.oeb.transforms.split import Split - pbx = accelerators.get('pagebreaks', None) - split = Split(not self.opts.dont_split_on_page_breaks, - max_flow_size=self.opts.output_profile.flow_size, - page_breaks_xpath=pbx) - split(self.oeb, self.opts) pr(0.9) from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer @@ -652,13 +643,14 @@ OptionRecommendation(name='list_recipes', self.opts, self.log) self.ui_reporter(1.) -def create_oebbook(log, path_or_stream, opts, reader=None): +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): ''' Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor() - oeb = OEBBook(log, html_preprocessor=html_preprocessor, + html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, + opts.preprocess_html) + oeb = OEBBook(log, html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook log('Parsing all content...') diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9bfe6d4255..76fc36708e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,16 +26,16 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '

'+chap+'


\n' - else: + else: return '

'+chap+'
\n'+title+'


\n' def wrap_lines(match): ital = match.group('ital') - if not ital: + if not ital: return ' ' - else: + else: return ital+' ' def line_length(raw, percent): @@ -106,7 +106,7 @@ class HTMLPreProcessor(object): (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'), (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), - + # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags @@ -151,6 +151,9 @@ class HTMLPreProcessor(object): (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), ] + def __init__(self, input_plugin_preprocess, plugin_preprocess): + self.input_plugin_preprocess = input_plugin_preprocess + self.plugin_preprocess = plugin_preprocess def is_baen(self, src): return re.compile(r')?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] - + rules = self.PDFTOHTML + line_length_rules else: rules = [] @@ -192,5 +195,8 @@ class HTMLPreProcessor(object): html = XMLDECL_RE.sub('', html) + if self.plugin_preprocess: + html = self.input_plugin_preprocess(html) + return html diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index d5f0a9349a..aba9bff0d8 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -28,7 +28,21 @@ class EPUBOutput(OutputFormatPlugin): OptionRecommendation(name='extract_to', help=_('Extract the contents of the generated EPUB file to the ' 'specified directory. The contents of the directory are first ' - 'deleted, so be careful.')) + 'deleted, so be careful.')), + + OptionRecommendation(name='dont_split_on_page_breaks', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Turn off splitting at page breaks. Normally, input ' + 'files are automatically split at every page break into ' + 'two files. This gives an output ebook that can be ' + 'parsed faster and with less resources. However, ' + 'splitting is slow and if your source file contains a ' + 'very large number of page breaks, you should turn off ' + 'splitting on page breaks.' + ) + ), + + ]) @@ -88,6 +102,13 @@ class EPUBOutput(OutputFormatPlugin): def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb + from calibre.ebooks.oeb.transforms.split import Split + split = Split(not self.opts.dont_split_on_page_breaks, + max_flow_size=self.opts.output_profile.flow_size + ) + split(self.oeb, self.opts) + + self.workaround_ade_quirks() from calibre.ebooks.oeb.transforms.rescale import RescaleImages diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 252032a23d..255d975b1e 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin): return opfpath from calibre.ebooks.conversion.plumber import create_oebbook - oeb = create_oebbook(log, opfpath, opts) + oeb = create_oebbook(log, opfpath, opts, self) from calibre.ebooks.oeb.transforms.package import Package Package(os.getcwdu())(oeb, opts) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 2d726f7eeb..409482da29 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin): accelerators): from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.conversion.plumber import create_oebbook - return create_oebbook(log, stream, options, reader=LitReader) + return create_oebbook(log, stream, options, self, reader=LitReader) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 18f53317e0..a2d999ffc8 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -80,19 +80,6 @@ class MobiMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables - @classmethod - def config(cls, cfg): - group = cfg.add_group('mobiml', _('Mobipocket markup options.')) - group('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. This is neccessary if the HTML contains very ' - 'large or complex tables.')) - return cfg - - @classmethod - def generate(cls, opts): - return cls(ignore_tables=opts.ignore_tables) - def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py new file mode 100644 index 0000000000..1866888ab1 --- /dev/null +++ b/src/calibre/ebooks/mobi/output.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.customize.conversion import OptionRecommendation + +class MOBIOutput(OutputFormatPlugin): + + name = 'MOBI Output' + author = 'Marshall T. Vandegrift' + file_type = 'mobi' + + options = set([ + OptionRecommendation(name='rescale_images', recommended_value=False, + help=_('Modify images to meet Palm device size limitations.') + ), + OptionRecommendation(name='prefer_author_sort', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('When present, use author sort field as author.') + ), + OptionRecommendation(name='toc_title', recommended_value=None, + help=_('Title for any generated in-line table of contents.') + ), + ]) + + def convert(self, oeb, output_path, input_plugin, opts, log): + self.log, self.opts, self.oeb = log, opts, oeb + from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter + from calibre.ebooks.mobi.mobiml import MobiMLizer + from calibre.ebooks.oeb.transforms.manglecase import CaseMangler + from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer + from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + tocadder = HTMLTOCAdder(title=opts.toc_title) + tocadder(oeb, opts) + mangler = CaseMangler() + mangler(oeb, opts) + rasterizer = SVGRasterizer() + rasterizer(oeb, opts) + mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables) + mobimlizer(oeb, opts) + writer = MobiWriter(imagemax=imagemax, + prefer_author_sort=opts.prefer_author_sort) + writer(oeb, output_path) + diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index c521ba9977..e16deeccda 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -6,8 +6,6 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys -import os from struct import pack import time import random @@ -16,24 +14,14 @@ import re from itertools import izip, count from collections import defaultdict from urlparse import urldefrag -import logging from PIL import Image from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ OEB_RASTER_IMAGES from calibre.ebooks.oeb.base import namespace, prefixname from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.oeb.base import OEBBook -from calibre.ebooks.oeb.profile import Context -from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener -from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer -from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer -from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder -from calibre.ebooks.oeb.transforms.manglecase import CaseMangler from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi -from calibre.ebooks.mobi.mobiml import MBP_NS, MobiMLizer -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig +from calibre.ebooks.mobi.mobiml import MBP_NS # TODO: # - Allow override CSS (?) @@ -293,58 +281,22 @@ class Serializer(object): buffer.write('%010d' % ioff) -class MobiFlattener(object): - def config(self, cfg): - return cfg - - def generate(self, opts): - return self - - def __call__(self, oeb, context): - fbase = context.dest.fbase - fkey = context.dest.fnums.values() - flattener = CSSFlattener( - fbase=fbase, fkey=fkey, unfloat=True, untable=True) - return flattener(oeb, context) - class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') - DEFAULT_PROFILE = 'CybookG3' - - TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, - ManifestTrimmer, MobiMLizer] - - def __init__(self, compression=None, imagemax=None, + def __init__(self, compression=PALMDOC, imagemax=None, prefer_author_sort=False): self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - @classmethod - def config(cls, cfg): - """Add any book-writing options to the :class:`Config` object - :param:`cfg`. - """ - mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. ' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) - return cfg - @classmethod def generate(cls, opts): """Generate a Writer instance from command-line options.""" - compression = PALMDOC if opts.compress else UNCOMPRESSED imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None prefer_author_sort = opts.prefer_author_sort - return cls(compression=compression, imagemax=imagemax, + return cls(compression=PALMDOC, imagemax=imagemax, prefer_author_sort=prefer_author_sort) def __call__(self, oeb, path): @@ -577,88 +529,4 @@ class MobiWriter(object): self._write(record) -def config(defaults=None): - desc = _('Options to control the conversion to MOBI') - _profiles = list(sorted(Context.PROFILES.keys())) - if defaults is None: - c = Config('mobi', desc) - else: - c = StringConfig(defaults, desc) - profiles = c.add_group('profiles', _('Device renderer profiles. ' - 'Affects conversion of font sizes, image rescaling and rasterization ' - 'of tables. Valid profiles are: %s.') % ', '.join(_profiles)) - profiles('source_profile', ['--source-profile'], - default='Browser', choices=_profiles, - help=_("Source renderer profile. Default is %default.")) - profiles('dest_profile', ['--dest-profile'], - default='CybookG3', choices=_profiles, - help=_("Destination renderer profile. Default is %default.")) - c.add_opt('encoding', ['--encoding'], default=None, - help=_('Character encoding for HTML files. Default is to auto detect.')) - return c - - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2mobi(opts, inpath): - logger = Logger(logging.getLogger('oeb2mobi')) - logger.setup_cli_handler(opts.verbose) - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.mobi' - source = opts.source_profile - if source not in Context.PROFILES: - logger.error(_('Unknown source profile %r') % source) - return 1 - dest = opts.dest_profile - if dest not in Context.PROFILES: - logger.error(_('Unknown destination profile %r') % dest) - return 1 - compression = PALMDOC if opts.compress else UNCOMPRESSED - imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None - context = Context(source, dest) - oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding) - tocadder = HTMLTOCAdder(title=opts.toc_title) - tocadder.transform(oeb, context) - mangler = CaseMangler() - mangler.transform(oeb, context) - fbase = context.dest.fbase - fkey = context.dest.fnums.values() - flattener = CSSFlattener( - fbase=fbase, fkey=fkey, unfloat=True, untable=True) - flattener.transform(oeb, context) - rasterizer = SVGRasterizer() - rasterizer.transform(oeb, context) - trimmer = ManifestTrimmer() - trimmer.transform(oeb, context) - mobimlizer = MobiMLizer(ignore_tables=opts.ignore_tables) - mobimlizer.transform(oeb, context) - writer = MobiWriter(compression=compression, imagemax=imagemax, - prefer_author_sort=opts.prefer_author_sort) - writer.dump(oeb, outpath) - run_plugins_on_postprocess(outpath, 'mobi') - logger.info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2mobi(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index faf2d02dc4..bbac34f0b1 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -22,8 +22,7 @@ from cssutils import CSSParser from calibre.translations.dynamic import translate from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \ - CSSPreProcessor +from calibre.ebooks.conversion.preprocess import CSSPreProcessor XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -1506,7 +1505,7 @@ class OEBBook(object): COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') def __init__(self, logger, - html_preprocessor=HTMLPreProcessor(), + html_preprocessor, css_preprocessor=CSSPreProcessor(), encoding='utf-8', pretty_print=False): """Create empty book. Arguments: