Sync to pluginize

This commit is contained in:
John Schember 2009-05-02 18:13:51 -04:00
commit 9aae507c07
12 changed files with 128 additions and 190 deletions

View File

@ -290,6 +290,7 @@ from calibre.ebooks.comic.input import ComicInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.input import PMLInput
@ -309,9 +310,9 @@ from calibre.devices.jetbook.driver import JETBOOK
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
PMLOutput]
plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \
JETBOOK]
PMLOutput, MOBIOutput]
plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
EB600, JETBOOK]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
def preprocess_html(self, html):
    '''
    Hook run by the conversion pipeline over every piece of HTML before
    that HTML is parsed. Override it in a sub-class to clean up the
    markup (removing hard line breaks, for instance). The default
    implementation hands the markup back unchanged.

    :param html: A unicode string
    :return: A unicode string
    '''
    return html
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return

View File

@ -126,9 +126,10 @@ def add_pipeline_options(parser, plumber):
'STRUCTURE DETECTION' : (
_('Control auto-detection of document structure.'),
[
'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
'preprocess_html',
]
),

View File

@ -131,18 +131,6 @@ OptionRecommendation(name='linearize_tables',
)
),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output ebook that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
OptionRecommendation(name='level1_toc',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('XPath expression that specifies all tags that '
@ -312,6 +300,14 @@ OptionRecommendation(name='insert_metadata',
)
),
OptionRecommendation(name='preprocess_html',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Attempt to detect and correct hard line breaks and other '
'problems in the source file. This may make things worse, so use '
'with care.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
@ -580,7 +576,8 @@ OptionRecommendation(name='list_recipes',
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin)
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
pr(0., _('Running transforms on ebook...'))
@ -619,20 +616,14 @@ OptionRecommendation(name='list_recipes',
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=self.opts.line_height,
untable=self.opts.linearize_tables)
untable=self.output_plugin.file_type in ('mobi','lit'),
unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
flattener(self.oeb, self.opts)
if self.opts.linearize_tables:
if self.opts.linearize_tables and \
self.output_plugin.file_type not in ('mobi', 'lrf'):
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
pr(0.7)
from calibre.ebooks.oeb.transforms.split import Split
pbx = accelerators.get('pagebreaks', None)
split = Split(not self.opts.dont_split_on_page_breaks,
max_flow_size=self.opts.output_profile.flow_size,
page_breaks_xpath=pbx)
split(self.oeb, self.opts)
pr(0.9)
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
@ -652,13 +643,14 @@ OptionRecommendation(name='list_recipes',
self.opts, self.log)
self.ui_reporter(1.)
def create_oebbook(log, path_or_stream, opts, reader=None):
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor()
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook
log('Parsing all content...')

View File

@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
    '''
    Remember the input plugin's HTML hook and the switch that enables it.

    :param input_plugin_preprocess: Callable stored on the instance; it
        is later called with the HTML and its result replaces the HTML.
    :param plugin_preprocess: Stored on the instance and later tested for
        truth to decide whether the callable above is applied at all.
    '''
    (self.input_plugin_preprocess,
     self.plugin_preprocess) = (input_plugin_preprocess, plugin_preprocess)
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
return html

View File

@ -28,7 +28,21 @@ class EPUBOutput(OutputFormatPlugin):
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated EPUB file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.'))
'deleted, so be careful.')),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output ebook that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
])
@ -88,6 +102,13 @@ class EPUBOutput(OutputFormatPlugin):
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.oeb.transforms.split import Split
split = Split(not self.opts.dont_split_on_page_breaks,
max_flow_size=self.opts.output_profile.flow_size
)
split(self.oeb, self.opts)
self.workaround_ade_quirks()
from calibre.ebooks.oeb.transforms.rescale import RescaleImages

View File

@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts)
oeb = create_oebbook(log, opfpath, opts, self)
from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)

View File

@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
accelerators):
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream, options, reader=LitReader)
return create_oebbook(log, stream, options, self, reader=LitReader)

View File

@ -80,19 +80,6 @@ class MobiMLizer(object):
def __init__(self, ignore_tables=False):
    '''
    :param ignore_tables: Kept, unchanged, on the instance under the
        attribute of the same name.
    '''
    self.ignore_tables = ignore_tables
@classmethod
def config(cls, cfg):
    '''
    Add the ``mobiml`` option group, holding the single ``ignore_tables``
    switch, to `cfg` and return `cfg`.
    '''
    mobiml = cfg.add_group('mobiml', _('Mobipocket markup options.'))
    tables_help = _('Render HTML tables as blocks of text instead of actual '
                    'tables. This is neccessary if the HTML contains very '
                    'large or complex tables.')
    mobiml('ignore_tables', ['--ignore-tables'], default=False,
           help=tables_help)
    return cfg
@classmethod
def generate(cls, opts):
    '''
    Alternate constructor: build an instance of `cls` whose
    ``ignore_tables`` argument is taken from ``opts.ignore_tables``.
    '''
    skip_tables = opts.ignore_tables
    return cls(ignore_tables=skip_tables)
def __call__(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
class MOBIOutput(OutputFormatPlugin):
    '''
    Output format plugin that turns an OEB book into a file of type
    ``mobi``.
    '''

    name = 'MOBI Output'
    author = 'Marshall T. Vandegrift'
    file_type = 'mobi'

    # Options that belong to this output format only.
    options = set([
        OptionRecommendation(
            name='rescale_images',
            recommended_value=False,
            help=_('Modify images to meet Palm device size limitations.')),
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
    ])

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Run the MOBI-specific transforms over `oeb` and write the result
        to `output_path`.

        The transforms are applied in a fixed order: in-line HTML table of
        contents, case mangling, SVG rasterization, conversion to
        Mobipocket markup, and finally the writer itself.

        :param oeb: The book to convert; every transform is called on it.
        :param output_path: Passed to the writer as the destination.
        :param input_plugin: Not used by this method.
        :param opts: Conversion options. ``rescale_images``, ``toc_title``,
            ``linearize_tables`` and ``prefer_author_sort`` are read here.
        :param log: Stored on the instance as ``self.log``.
        '''
        self.log = log
        self.opts = opts
        self.oeb = oeb

        # Imported lazily, inside the method, exactly as the transforms
        # are needed.
        from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder

        # None lets the writer pick its own image size limit.
        if opts.rescale_images:
            imagemax = PALM_MAX_IMAGE_SIZE
        else:
            imagemax = None

        HTMLTOCAdder(title=opts.toc_title)(oeb, opts)
        CaseMangler()(oeb, opts)
        SVGRasterizer()(oeb, opts)
        MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)

        write_book = MobiWriter(imagemax=imagemax,
                                prefer_author_sort=opts.prefer_author_sort)
        write_book(oeb, output_path)

View File

@ -6,8 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
import sys
import os
from struct import pack
import time
import random
@ -16,24 +14,14 @@ import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
import logging
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS, MobiMLizer
from calibre.customize.ui import run_plugins_on_postprocess
from calibre.utils.config import Config, StringConfig
from calibre.ebooks.mobi.mobiml import MBP_NS
# TODO:
# - Allow override CSS (?)
@ -293,58 +281,22 @@ class Serializer(object):
buffer.write('%010d' % ioff)
class MobiFlattener(object):
    '''
    Transform wrapper that runs a `CSSFlattener` with both ``unfloat`` and
    ``untable`` switched on, using the font data of ``context.dest``.
    '''

    def config(self, cfg):
        '''Add nothing to `cfg`; return it as received.'''
        return cfg

    def generate(self, opts):
        '''Ignore `opts`; this very instance is the transform.'''
        return self

    def __call__(self, oeb, context):
        '''Flatten the CSS of `oeb` and return the flattener's result.'''
        dest = context.dest
        flatten = CSSFlattener(
            fbase=dest.fbase, fkey=dest.fnums.values(),
            unfloat=True, untable=True)
        return flatten(oeb, context)
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
DEFAULT_PROFILE = 'CybookG3'
TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer,
ManifestTrimmer, MobiMLizer]
def __init__(self, compression=None, imagemax=None,
def __init__(self, compression=PALMDOC, imagemax=None,
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
@classmethod
def config(cls, cfg):
    """Register the ``mobipocket`` option group (``compress``,
    ``rescale_images`` and ``prefer_author_sort``) on the :class:`Config`
    object :param:`cfg` and return that same object.
    """
    add_option = cfg.add_group('mobipocket', _('Mobipocket-specific options.'))

    compress_help = _('Compress file text using PalmDOC compression. '
                      'Results in smaller files, but takes a long time to run.')
    add_option('compress', ['--compress'], default=False,
               help=compress_help)

    rescale_help = _('Modify images to meet Palm device size limitations.')
    add_option('rescale_images', ['--rescale-images'], default=False,
               help=rescale_help)

    author_sort_help = _('When present, use the author sorting information for '
                         'generating the Mobipocket author metadata.')
    add_option('prefer_author_sort', ['--prefer-author-sort'], default=False,
               help=author_sort_help)

    return cfg
@classmethod
def generate(cls, opts):
"""Generate a Writer instance from command-line options."""
compression = PALMDOC if opts.compress else UNCOMPRESSED
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort
return cls(compression=compression, imagemax=imagemax,
return cls(compression=PALMDOC, imagemax=imagemax,
prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path):
@ -577,88 +529,4 @@ class MobiWriter(object):
self._write(record)
def config(defaults=None):
    '''
    Build the configuration object for the conversion to MOBI.

    :param defaults: When None a `Config` named ``mobi`` is created;
        otherwise a `StringConfig` is created from this value.
    :return: The configuration object, carrying the ``profiles`` group
        (``source_profile``, ``dest_profile``) and the ``encoding`` option.
    '''
    desc = _('Options to control the conversion to MOBI')
    profile_names = sorted(Context.PROFILES.keys())

    c = Config('mobi', desc) if defaults is None \
            else StringConfig(defaults, desc)

    group_help = _('Device renderer profiles. '
        'Affects conversion of font sizes, image rescaling and rasterization '
        'of tables. Valid profiles are: %s.') % ', '.join(profile_names)
    add_profile = c.add_group('profiles', group_help)

    add_profile('source_profile', ['--source-profile'],
                default='Browser', choices=profile_names,
                help=_("Source renderer profile. Default is %default."))
    add_profile('dest_profile', ['--dest-profile'],
                default='CybookG3', choices=profile_names,
                help=_("Destination renderer profile. Default is %default."))

    encoding_help = _('Character encoding for HTML files. '
                      'Default is to auto detect.')
    c.add_opt('encoding', ['--encoding'], default=None, help=encoding_help)
    return c
def option_parser():
    '''
    Return the command line parser: the parser derived from `config()`
    with the ``--output`` and ``--verbose`` options added to it.
    '''
    usage = '%prog '+_('[options]')+' file.opf'
    parser = config().option_parser(usage=usage)

    output_help = _('Output file. Default is derived from input filename.')
    parser.add_option('-o', '--output', default=None, help=output_help)

    # Each repetition of -v raises the count by one.
    verbose_help = _('Useful for debugging.')
    parser.add_option('-v', '--verbose', default=0, action='count',
                      help=verbose_help)
    return parser
def oeb2mobi(opts, inpath):
    '''
    Convert the book at `inpath` to a MOBI file.

    :param opts: Parsed options. Read here: ``verbose``, ``output``,
        ``source_profile``, ``dest_profile``, ``compress``,
        ``rescale_images``, ``encoding``, ``toc_title``, ``ignore_tables``
        and ``prefer_author_sort``.
    :param inpath: Path of the input book.
    :return: 1 when either profile name is unknown; None otherwise.
    '''
    logger = Logger(logging.getLogger('oeb2mobi'))
    logger.setup_cli_handler(opts.verbose)

    # Without an explicit output, use the input's base name with the
    # extension swapped for .mobi (so it lands in the working directory).
    outpath = opts.output
    if outpath is None:
        outpath = os.path.splitext(os.path.basename(inpath))[0] + '.mobi'

    source = opts.source_profile
    if source not in Context.PROFILES:
        logger.error(_('Unknown source profile %r') % source)
        return 1
    dest = opts.dest_profile
    if dest not in Context.PROFILES:
        logger.error(_('Unknown destination profile %r') % dest)
        return 1

    if opts.compress:
        compression = PALMDOC
    else:
        compression = UNCOMPRESSED
    if opts.rescale_images:
        imagemax = PALM_MAX_IMAGE_SIZE
    else:
        imagemax = None

    context = Context(source, dest)
    oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding)

    # The transforms below run strictly in this order.
    HTMLTOCAdder(title=opts.toc_title).transform(oeb, context)
    CaseMangler().transform(oeb, context)
    CSSFlattener(
        fbase=context.dest.fbase, fkey=context.dest.fnums.values(),
        unfloat=True, untable=True).transform(oeb, context)
    SVGRasterizer().transform(oeb, context)
    ManifestTrimmer().transform(oeb, context)
    MobiMLizer(ignore_tables=opts.ignore_tables).transform(oeb, context)

    writer = MobiWriter(compression=compression, imagemax=imagemax,
                        prefer_author_sort=opts.prefer_author_sort)
    writer.dump(oeb, outpath)
    run_plugins_on_postprocess(outpath, 'mobi')
    logger.info(_('Output written to ') + outpath)
def main(argv=sys.argv):
    '''
    Command line entry point.

    :param argv: Argument list; its first element is skipped.
    :return: 1, after printing the help text, unless exactly one
        positional argument was given; otherwise whatever `oeb2mobi`
        returns for that argument.
    '''
    parser = option_parser()
    opts, positional = parser.parse_args(argv[1:])
    if len(positional) == 1:
        return oeb2mobi(opts, positional[0])
    parser.print_help()
    return 1
if __name__ == '__main__':
sys.exit(main())

View File

@ -22,8 +22,7 @@ from cssutils import CSSParser
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
CSSPreProcessor
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -1506,7 +1505,7 @@ class OEBBook(object):
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
def __init__(self, logger,
html_preprocessor=HTMLPreProcessor(),
html_preprocessor,
css_preprocessor=CSSPreProcessor(),
encoding='utf-8', pretty_print=False):
"""Create empty book. Arguments: