Demonstrable modularization of e-book conversion.

This commit is contained in:
Marshall T. Vandegrift 2009-02-11 10:00:54 -05:00
parent e5984c02c7
commit 5dca631114
12 changed files with 210 additions and 73 deletions

View File

@ -802,6 +802,7 @@ class LitFile(object):
class LitContainer(object):
"""Simple Container-interface, read-only accessor for LIT files."""
def __init__(self, filename_or_stream):
self._litfile = LitFile(filename_or_stream)

View File

@ -82,7 +82,15 @@ class MobiMLizer(object):
def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb
self.profile = profile = context.dest

View File

@ -295,6 +295,11 @@ class Serializer(object):
class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
DEFAULT_PROFILE = 'CybookG3'
TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer,
ManifestTrimmer, MobiMLizer]
def __init__(self, compression=None, imagemax=None,
prefer_author_sort=False):
@ -302,7 +307,32 @@ class MobiWriter(object):
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
def dump(self, oeb, path):
@classmethod
def config(cls, cfg):
"""Add any book-writing options to the :class:`Config` object
:param:`cfg`.
"""
mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.'))
mobi('compress', ['--compress'], default=False,
help=_('Compress file text using PalmDOC compression. '
'Results in smaller files, but takes a long time to run.'))
mobi('rescale_images', ['--rescale-images'], default=False,
help=_('Modify images to meet Palm device size limitations.'))
mobi('prefer_author_sort', ['--prefer-author-sort'], default=False,
help=_('When present, use the author sorting information for '
'generating the Mobipocket author metadata.'))
return cfg
@classmethod
def generate(cls, opts):
"""Generate a Writer instance from command-line options."""
compression = PALMDOC if opts.compress else UNCOMPRESSED
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort
return cls(compression=compression, imagemax=imagemax,
prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path):
if hasattr(path, 'write'):
return self._dump_stream(oeb, path)
with open(path, 'w+b') as stream:
@ -533,20 +563,12 @@ def config(defaults=None):
c = StringConfig(defaults, desc)
mobi = c.add_group('mobipocket', _('Mobipocket-specific options.'))
mobi('compress', ['--compress'], default=False,
help=_('Compress file text using PalmDOC compression. '
'Results in smaller files, but takes a long time to run.'))
mobi('rescale_images', ['--rescale-images'], default=False,
help=_('Modify images to meet Palm device size limitations.'))
mobi('toc_title', ['--toc-title'], default=None,
help=_('Title for any generated in-line table of contents.'))
mobi('ignore_tables', ['--ignore-tables'], default=False,
help=_('Render HTML tables as blocks of text instead of actual '
'tables. This is neccessary if the HTML contains very large '
'or complex tables.'))
mobi('prefer_author_sort', ['--prefer-author-sort'], default=False,
help=_('When present, use the author sorting information for '
'generating the Mobipocket author metadata.'))
profiles = c.add_group('profiles', _('Device renderer profiles. '
'Affects conversion of font sizes, image rescaling and rasterization '
'of tables. Valid profiles are: %s.') % ', '.join(_profiles))

View File

@ -820,8 +820,10 @@ class Manifest(object):
def __iter__(self):
for item in self.items:
yield item
values = __iter__
def values(self):
return list(self.items)
def __contains__(self, item):
return item in self.items
@ -1134,7 +1136,7 @@ class TOC(object):
node.to_opf1(tour)
return tour
def to_ncx(self, parent, depth=1):
def to_ncx(self, parent):
for node in self.nodes:
id = node.id or unicode(uuid.uuid4())
attrib = {'id': id, 'playOrder': '0'}
@ -1143,9 +1145,8 @@ class TOC(object):
point = element(parent, NCX('navPoint'), attrib=attrib)
label = etree.SubElement(point, NCX('navLabel'))
element(label, NCX('text')).text = node.title
href = node.href if depth > 1 else urldefrag(node.href)[0]
element(point, NCX('content'), src=href)
node.to_ncx(point, depth+1)
element(point, NCX('content'), src=node.href)
node.to_ncx(point)
return parent

View File

@ -6,20 +6,93 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os
import sys, os, logging
from itertools import chain
from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter
from calibre.ebooks.oeb.base import Logger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config
__all__ = ['get_reader']
READER_REGISTRY = {
'.opf': OEBReader,
'.lit': LitReader,
REGISTRY = {
'.opf': (OEBReader, None),
'.lit': (LitReader, LitWriter),
'.mobi': (MobiReader, MobiWriter),
}
def ReaderFactory(path):
ext = os.path.splitext(path)[1].lower()
if not ext:
if os.path.isdir(path):
return OEBReader
return READER_REGISTRY[ext]()
ext = os.path.splitext(path)[1].lower()
Reader = REGISTRY.get(ext, (None, None))[0]
if Reader is None:
raise OEBError('Unknown e-book file extension %r' % ext)
return Reader
def WriterFactory(path):
if os.path.isdir(path):
return OEBWriter
ext = os.path.splitext(path)[1].lower()
if not os.path.exists(path) and not ext:
return OEBWriter
Writer = REGISTRY.get(ext, (None, None))[1]
if Writer is None:
raise OEBError('Unknown e-book file extension %r' % ext)
return Writer
def option_parser(Reader, Writer):
cfg = Config('ebook-convert', _('Options to control e-book conversion.'))
Reader.config(cfg)
for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
Transform.config(cfg)
Writer.config(cfg)
parser = cfg.option_parser()
parser.add_option('--encoding', default=None,
help=_('Character encoding for input. Default is to auto detect.'))
parser.add_option('-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option('-p', '--pretty-print', action='store_true',
default=False, help=_('Produce more human-readable XML output.'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def main(argv=sys.argv):
if len(argv) < 3:
print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]")
return 1
inpath, outpath = argv[1], argv[2]
Reader = ReaderFactory(inpath)
Writer = WriterFactory(outpath)
parser = option_parser(Reader, Writer)
opts, args = parser.parse_args(argv[3:])
if len(args) != 0:
parser.print_help()
return 1
logger = Logger(logging.getLogger('ebook-convert'))
logger.setup_cli_handler(opts.verbose)
encoding = opts.encoding
pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)
context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE)
reader = Reader.generate(opts)
writer = Writer.generate(opts)
transforms = []
for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
transforms.append(Transform.generate(opts))
reader(oeb, inpath)
for transform in transforms:
transform(oeb, context)
writer(oeb, outpath)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -31,15 +31,39 @@ from calibre.ptempfile import TemporaryDirectory
__all__ = ['OEBReader']
class OEBReader(object):
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
Container = DirContainer
"""Container type used to access book files. Override in sub-classes."""
DEFAULT_PROFILE = 'PRS505'
"""Default renderer profile for content read with this Reader."""
TRANSFORMS = []
"""List of transforms to apply to content read with this Reader."""
def __init__(self):
return
@classmethod
def config(cls, cfg):
"""Add any book-reading options to the :class:`Config` object
:param:`cfg`.
"""
return
@classmethod
def generate(cls, opts):
"""Generate a Reader instance from command-line options."""
return cls()
def __call__(self, oeb, path):
"""Read the book at :param:`path` into the :class:`OEBBook` object
:param:`oeb`.
"""
self.oeb = oeb
self.logger = oeb.logger
oeb.container = self.Container(path)

View File

@ -94,7 +94,15 @@ class CSSFlattener(object):
self.unfloat = unfloat
self.untable = untable
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Flattening CSS and remapping font sizes...')
self.oeb = oeb
self.context = context

View File

@ -52,7 +52,18 @@ class HTMLTOCAdder(object):
self.title = title
self.style = style
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
group = cfg.add_group('htmltoc', _('HTML TOC generation options.'))
group('toc_title', ['--toc-title'], default=None,
help=_('Title for any generated in-line table of contents.'))
return cfg
@classmethod
def generate(cls, opts):
return cls(title=opts.toc_title)
def __call__(self, oeb, context):
if 'toc' in oeb.guide:
return
oeb.logger.info('Generating in-line TOC...')

View File

@ -29,7 +29,15 @@ CASE_MANGLER_CSS = """
TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase'])
class CaseMangler(object):
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Applying case-transforming CSS...')
self.oeb = oeb
self.profile = context.source

View File

@ -34,7 +34,15 @@ class SVGRasterizer(object):
if QApplication.instance() is None:
QApplication([])
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb
self.profile = context.dest

View File

@ -17,7 +17,15 @@ from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
from calibre.ebooks.oeb.base import urlnormalize
class ManifestTrimmer(object):
def transform(self, oeb, context):
@classmethod
def config(cls, cfg):
return cfg
@classmethod
def generate(cls, opts):
return cls()
def __call__(self, oeb, context):
oeb.logger.info('Trimming unused files from manifest...')
used = set()
hrefs = oeb.manifest.hrefs

View File

@ -9,13 +9,16 @@ __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook
from calibre.utils.config import Config
__all__ = ['OEBWriter']
class OEBWriter(object):
DEFAULT_PROFILE = 'PRS505'
"""Default renderer profile for content written with this Writer."""
TRANSFORMS = []
"""List of transforms to apply to content written with this Writer."""
def __init__(self, version='2.0', page_map=False, pretty_print=False):
self.version = version
self.page_map = page_map
@ -23,6 +26,9 @@ class OEBWriter(object):
@classmethod
def config(cls, cfg):
"""Add any book-writing options to the :class:`Config` object
:param:`cfg`.
"""
oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.'))
versions = ['1.2', '2.0']
oeb('opf_version', ['--opf-version'], default='2.0', choices=versions,
@ -34,6 +40,7 @@ class OEBWriter(object):
@classmethod
def generate(cls, opts):
"""Generate a Writer instance from command-line options."""
version = opts.opf_version
page_map = opts.adobe_page_map
pretty_print = opts.pretty_print
@ -41,6 +48,9 @@ class OEBWriter(object):
pretty_print=pretty_print)
def __call__(self, oeb, path):
"""Read the book in the :class:`OEBBook` object :param:`oeb` to a file
at :param:`path`.
"""
version = int(self.version[0])
opfname = None
if os.path.splitext(path)[1].lower() == '.opf':
@ -63,48 +73,3 @@ class OEBWriter(object):
href = opfname
output.write(href, xml2str(data, pretty_print=pretty_print))
return
def option_parser():
cfg = Config('oeb', _('Options to control OEB conversion.'))
OEBWriter.config(cfg)
parser = cfg.option_parser()
parser.add_option('--encoding', default=None,
help=_('Character encoding for files. Default is to auto detect.'))
parser.add_option('-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option('-p', '--pretty-print', action='store_true',
default=False, help=_('Produce more human-readable XML output.'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def any2oeb(opts, inpath):
from calibre.ebooks.oeb.factory import ReaderFactory
logger = Logger(logging.getLogger('any2oeb'))
logger.setup_cli_handler(opts.verbose)
outpath = opts.output
if outpath is None:
outpath = os.path.basename(inpath)
outpath = os.path.splitext(outpath)[0]
encoding = opts.encoding
pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)
reader = ReaderFactory(inpath)
reader(oeb, inpath)
writer = OEBWriter.generate(opts)
writer(oeb, outpath)
return 0
def main(argv=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(argv[1:])
if len(args) != 1:
parser.print_help()
return 1
inpath = args[0]
retval = any2oeb(opts, inpath)
return retval
if __name__ == '__main__':
sys.exit(main())