More fixes and improvements. Etc etc etc.

This commit is contained in:
Marshall T. Vandegrift 2009-01-08 08:47:59 -05:00
parent c02491eddc
commit a7753d3420
6 changed files with 115 additions and 52 deletions

View File

@ -19,6 +19,8 @@ from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
MBP_NS = 'http://mobipocket.com/ns/mbp' MBP_NS = 'http://mobipocket.com/ns/mbp'
def MBP(name): return '{%s}%s' % (MBP_NS, name) def MBP(name): return '{%s}%s' % (MBP_NS, name)
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td', 'th']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td', 'th'])
TABLE_TAGS = set(['table', 'tr', 'td', 'th']) TABLE_TAGS = set(['table', 'tr', 'td', 'th'])
@ -77,26 +79,34 @@ class FormatState(object):
class MobiMLizer(object): class MobiMLizer(object):
# Nothing is set up at construction time; transform() assigns the per-run
# state (oeb, profile, fnums, fmap) each time it is called.
def __init__(self):
pass
def transform(self, oeb, context): def transform(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb self.oeb = oeb
self.profile = profile = context.dest self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
self.remove_html_cover()
self.mobimlize_spine() self.mobimlize_spine()
# Remove the guide's 'cover' reference and the manifest item it points at
# (presumably the HTML cover page -- confirm against callers).
# Does nothing unless oeb.metadata.cover is truthy AND the guide has a
# 'cover' entry.
def remove_html_cover(self):
oeb = self.oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
# The guide entry is deleted before the manifest lookup below.
del oeb.guide['cover']
# NOTE(review): this subscript is unguarded; if href is not a key of
# manifest.hrefs (e.g. it carries a '#fragment') the lookup fails after the
# guide entry is already gone -- confirm guide hrefs are bare manifest paths.
item = oeb.manifest.hrefs[href]
oeb.manifest.remove(item)
def mobimlize_spine(self): def mobimlize_spine(self):
for item in self.oeb.spine: for item in self.oeb.spine:
stylizer = Stylizer(item.data, item.href, self.oeb, self.profile) stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
data = item.data body = item.data.find(XHTML('body'))
data.remove(data.find(XHTML('head'))) nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
body = data.find(XHTML('body')) nbody = etree.SubElement(nroot, XHTML('body'))
nbody = etree.Element(XHTML('body'))
self.mobimlize_elem(body, stylizer, BlockState(nbody), self.mobimlize_elem(body, stylizer, BlockState(nbody),
[FormatState()]) [FormatState()])
data.replace(body, nbody) item.data = nroot
def mobimlize_font(self, ptsize): def mobimlize_font(self, ptsize):
return self.fnums[self.fmap[ptsize]] return self.fnums[self.fmap[ptsize]]
@ -116,7 +126,7 @@ class MobiMLizer(object):
lines = text.split('\n') lines = text.split('\n')
result = lines[:1] result = lines[:1]
for line in lines[1:]: for line in lines[1:]:
result.append(etree.Element('br')) result.append(etree.Element(XHTML('br')))
if line: if line:
result.append(line) result.append(line)
return result return result
@ -134,7 +144,7 @@ class MobiMLizer(object):
bstate.pbreak = False bstate.pbreak = False
if istate.ids: if istate.ids:
for id in istate.ids: for id in istate.ids:
etree.SubElement(body, 'a', attrib={'id': id}) etree.SubElement(body, XHTML('a'), attrib={'id': id})
istate.ids.clear() istate.ids.clear()
bstate.istate = None bstate.istate = None
bstate.anchor = None bstate.anchor = None
@ -147,22 +157,22 @@ class MobiMLizer(object):
elif indent != 0 and abs(indent) < self.profile.fbase: elif indent != 0 and abs(indent) < self.profile.fbase:
indent = (indent / abs(indent)) * self.profile.fbase indent = (indent / abs(indent)) * self.profile.fbase
if tag in NESTABLE_TAGS: if tag in NESTABLE_TAGS:
para = wrapper = etree.SubElement(parent, tag) para = wrapper = etree.SubElement(parent, XHTML(tag))
bstate.nested.append(para) bstate.nested.append(para)
if tag == 'li' and len(istates) > 1: if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1 istates[-2].list_num += 1
para.attrib['value'] = str(istates[-2].list_num) para.attrib['value'] = str(istates[-2].list_num)
elif left > 0 and indent >= 0: elif left > 0 and indent >= 0:
para = wrapper = etree.SubElement(parent, 'blockquote') para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
para = wrapper para = wrapper
emleft = int(round(left / self.profile.fbase)) - 1 emleft = int(round(left / self.profile.fbase)) - 1
emleft = min((emleft, 10)) emleft = min((emleft, 10))
while emleft > 0: while emleft > 0:
para = etree.SubElement(para, 'blockquote') para = etree.SubElement(para, XHTML('blockquote'))
emleft -= 1 emleft -= 1
else: else:
ptag = tag if tag in HEADER_TAGS else 'p' ptag = tag if tag in HEADER_TAGS else 'p'
para = wrapper = etree.SubElement(parent, ptag) para = wrapper = etree.SubElement(parent, XHTML(ptag))
bstate.inline = bstate.para = para bstate.inline = bstate.para = para
vspace = bstate.vpadding + bstate.vmargin vspace = bstate.vpadding + bstate.vmargin
bstate.vpadding = bstate.vmargin = 0 bstate.vpadding = bstate.vmargin = 0
@ -174,7 +184,7 @@ class MobiMLizer(object):
vspace = int(round(vspace / self.profile.fbase)) vspace = int(round(vspace / self.profile.fbase))
index = max((0, len(body) - 1)) index = max((0, len(body) - 1))
while vspace > 0: while vspace > 0:
body.insert(index, etree.Element('br')) body.insert(index, etree.Element(XHTML('br')))
vspace -= 1 vspace -= 1
if istate.halign != 'auto': if istate.halign != 'auto':
para.attrib['align'] = istate.halign para.attrib['align'] = istate.halign
@ -182,7 +192,7 @@ class MobiMLizer(object):
if tag in CONTENT_TAGS: if tag in CONTENT_TAGS:
bstate.inline = para bstate.inline = para
pstate = bstate.istate = None pstate = bstate.istate = None
etree.SubElement(para, tag, attrib=istate.attrib) etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
elif tag in TABLE_TAGS: elif tag in TABLE_TAGS:
para.attrib['valign'] = 'top' para.attrib['valign'] = 'top'
if not text: if not text:
@ -197,20 +207,21 @@ class MobiMLizer(object):
elif pstate and pstate.href == href: elif pstate and pstate.href == href:
inline = bstate.anchor inline = bstate.anchor
else: else:
inline = etree.SubElement(inline, 'a', href=href) inline = etree.SubElement(inline, XHTML('a'), href=href)
bstate.anchor = inline bstate.anchor = inline
if valign == 'super': if valign == 'super':
inline = etree.SubElement(inline, 'sup') inline = etree.SubElement(inline, XHTML('sup'))
elif valign == 'sub': elif valign == 'sub':
inline = etree.SubElement(inline, 'sub') inline = etree.SubElement(inline, XHTML('sub'))
if istate.family == 'monospace': if istate.family == 'monospace':
inline = etree.SubElement(inline, 'tt') inline = etree.SubElement(inline, XHTML('tt'))
if fsize != 3: if fsize != 3:
inline = etree.SubElement(inline, 'font', size=str(fsize)) inline = etree.SubElement(inline, XHTML('font'),
size=str(fsize))
if istate.italic: if istate.italic:
inline = etree.SubElement(inline, 'i') inline = etree.SubElement(inline, XHTML('i'))
if istate.bold: if istate.bold:
inline = etree.SubElement(inline, 'b') inline = etree.SubElement(inline, XHTML('b'))
bstate.inline = inline bstate.inline = inline
bstate.istate = istate bstate.istate = istate
inline = bstate.inline inline = bstate.inline
@ -353,7 +364,7 @@ class MobiMLizer(object):
if isblock: if isblock:
para = bstate.para para = bstate.para
if para is not None and para.text == u'\xa0': if para is not None and para.text == u'\xa0':
para.getparent().replace(para, etree.Element('br')) para.getparent().replace(para, etree.Element(XHTML('br')))
bstate.para = None bstate.para = None
bstate.istate = None bstate.istate = None
vmargin = asfloat(style['margin-bottom']) vmargin = asfloat(style['margin-bottom'])

View File

@ -26,6 +26,7 @@ from calibre.ebooks.oeb.base import FauxLogger, OEBBook
from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS, MBP, MobiMLizer from calibre.ebooks.mobi.mobiml import MBP_NS, MBP, MobiMLizer
@ -66,23 +67,28 @@ def encode(data):
return data.encode('utf-8') return data.encode('utf-8')
# Almost like the one for MS LIT, but not quite. # Almost like the one for MS LIT, but not quite.
def decint(value): DECINT_FORWARD = 0
DECINT_BACKWARD = 1
def decint(value, direction):
bytes = [] bytes = []
while True: while True:
b = value & 0x7f b = value & 0x7f
value >>= 7 value >>= 7
if not bytes: bytes.append(b)
b |= 0x80
bytes.append(chr(b))
if value == 0: if value == 0:
break break
return ''.join(reversed(bytes)) if direction == DECINT_FORWARD:
bytes[0] |= 0x80
elif direction == DECINT_BACKWARD:
bytes[-1] |= 0x80
return ''.join(chr(b) for b in reversed(bytes))
class Serializer(object): class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images): def __init__(self, oeb, images):
oeb.logger.info('Serializing markup content...')
self.oeb = oeb self.oeb = oeb
self.images = images self.images = images
self.id_offsets = {} self.id_offsets = {}
@ -238,22 +244,11 @@ class MobiWriter(object):
self._oeb = oeb self._oeb = oeb
self._stream = stream self._stream = stream
self._records = [None] self._records = [None]
self._remove_html_cover()
self._generate_content() self._generate_content()
self._generate_record0() self._generate_record0()
self._write_header() self._write_header()
self._write_content() self._write_content()
def _remove_html_cover(self):
oeb = self._oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
oeb.manifest.remove(item)
def _generate_content(self): def _generate_content(self):
self._map_image_names() self._map_image_names()
self._generate_text() self._generate_text()
@ -318,11 +313,17 @@ class MobiWriter(object):
running = offset running = offset
while breaks and (breaks[0] - offset) < RECORD_SIZE: while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3 pbreak = (breaks.pop(0) - running) >> 3
encoded = decint(pbreak) encoded = decint(pbreak, DECINT_FORWARD)
record.write(encoded) record.write(encoded)
running += pbreak << 3 running += pbreak << 3
nextra += len(encoded) nextra += len(encoded)
record.write(decint(nextra + 1)) lsize = 1
while True:
size = decint(nextra + lsize, DECINT_BACKWARD)
if len(size) == lsize:
break
lsize += 1
record.write(size)
self._records.append(record.getvalue()) self._records.append(record.getvalue())
nrecords += 1 nrecords += 1
offset += RECORD_SIZE offset += RECORD_SIZE
@ -479,10 +480,12 @@ def main(argv=sys.argv):
flattener = CSSFlattener(fbase=fbase, fkey=fkey, unfloat=True, flattener = CSSFlattener(fbase=fbase, fkey=fkey, unfloat=True,
untable=True) untable=True)
rasterizer = SVGRasterizer() rasterizer = SVGRasterizer()
trimmer = ManifestTrimmer()
mobimlizer = MobiMLizer() mobimlizer = MobiMLizer()
#flattener.transform(oeb, context) flattener.transform(oeb, context)
rasterizer.transform(oeb, context) rasterizer.transform(oeb, context)
#mobimlizer.transform(oeb, context) mobimlizer.transform(oeb, context)
trimmer.transform(oeb, context)
writer.dump(oeb, outpath) writer.dump(oeb, outpath)
return 0 return 0

View File

@ -125,12 +125,17 @@ def urlnormalize(href):
class OEBError(Exception): class OEBError(Exception):
pass pass
class FauxLogger(object): class FauxLogger(object):
def __getattr__(self, name): def __getattr__(self, name):
return self return self
def __call__(self, message): def __call__(self, message):
print message print message
# Logger that lets callers use short names: an attribute that normal lookup
# cannot find is retried with a 'log_' prefix, e.g. logger.warn resolves to
# logger.log_warn.
class Logger(LoggingInterface, object):
# __getattr__ is only invoked after regular attribute lookup has failed, so
# attributes that already exist are returned unchanged.
def __getattr__(self, name):
# object.__getattribute__ performs the plain lookup, so a missing
# 'log_<name>' raises AttributeError instead of re-entering this method.
return object.__getattribute__(self, 'log_' + name)
class AbstractContainer(object): class AbstractContainer(object):
def read_xml(self, path): def read_xml(self, path):
@ -745,19 +750,19 @@ class OEBBook(object):
self.uid = item self.uid = item
break break
else: else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid) self.logger.warn(u'Unique-identifier %r not found.' % uid)
for ident in metadata.identifier: for ident in metadata.identifier:
if 'id' in ident.attrib: if 'id' in ident.attrib:
self.uid = metadata.identifier[0] self.uid = metadata.identifier[0]
break break
if not metadata.language: if not metadata.language:
self.logger.log_warn(u'Language not specified.') self.logger.warn(u'Language not specified.')
metadata.add('language', 'en') metadata.add('language', 'en')
if not metadata.creator: if not metadata.creator:
self.logger.log_warn(u'Creator not specified.') self.logger.warn(u'Creator not specified.')
metadata.add('creator', 'Unknown') metadata.add('creator', 'Unknown')
if not metadata.title: if not metadata.title:
self.logger.log_warn(u'Title not specified.') self.logger.warn(u'Title not specified.')
metadata.add('title', 'Unknown') metadata.add('title', 'Unknown')
def _manifest_from_opf(self, opf): def _manifest_from_opf(self, opf):
@ -765,7 +770,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
href = elem.get('href') href = elem.get('href')
if not self.container.exists(href): if not self.container.exists(href):
self.logger.log_warn(u'Manifest item %r not found.' % href) self.logger.warn(u'Manifest item %r not found.' % href)
continue continue
manifest.add(elem.get('id'), href, elem.get('media-type'), manifest.add(elem.get('id'), href, elem.get('media-type'),
elem.get('fallback')) elem.get('fallback'))
@ -775,7 +780,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref') idref = elem.get('idref')
if idref not in self.manifest: if idref not in self.manifest:
self.logger.log_warn(u'Spine item %r not found.' % idref) self.logger.warn(u'Spine item %r not found.' % idref)
continue continue
item = self.manifest[idref] item = self.manifest[idref]
spine.add(item, elem.get('linear')) spine.add(item, elem.get('linear'))
@ -794,7 +799,7 @@ class OEBBook(object):
href = elem.get('href') href = elem.get('href')
path, frag = urldefrag(href) path, frag = urldefrag(href)
if path not in self.manifest.hrefs: if path not in self.manifest.hrefs:
self.logger.log_warn(u'Guide reference %r not found' % href) self.logger.warn(u'Guide reference %r not found' % href)
continue continue
guide.add(elem.get('type'), elem.get('title'), href) guide.add(elem.get('type'), elem.get('title'), href)
@ -993,7 +998,6 @@ class OEBBook(object):
NCX_MIME: (href, ncx)} NCX_MIME: (href, ncx)}
def main(argv=sys.argv): def main(argv=sys.argv):
for arg in argv[1:]: for arg in argv[1:]:
oeb = OEBBook(arg) oeb = OEBBook(arg)

View File

@ -88,6 +88,7 @@ class CSSFlattener(object):
self.untable = untable self.untable = untable
def transform(self, oeb, context): def transform(self, oeb, context):
oeb.logger.info('Flattening CSS and remapping font sizes...')
self.oeb = oeb self.oeb = oeb
self.context = context self.context = context
self.stylize_spine() self.stylize_spine()

View File

@ -33,6 +33,7 @@ class SVGRasterizer(object):
QApplication([]) QApplication([])
def transform(self, oeb, context): def transform(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb self.oeb = oeb
self.profile = context.dest self.profile = context.dest
self.images = {} self.images = {}
@ -143,6 +144,9 @@ class SVGRasterizer(object):
if key in self.images: if key in self.images:
href = self.images[key] href = self.images[key]
else: else:
logger = self.oeb.logger
logger.info('Rasterizing %r to %dx%d'
% (svgitem.href, size.width(), size.height()))
image = QImage(size, QImage.Format_ARGB32_Premultiplied) image = QImage(size, QImage.Format_ARGB32_Premultiplied)
image.fill(QColor("white").rgb()) image.fill(QColor("white").rgb())
painter = QPainter(image) painter = QPainter(image)

View File

@ -0,0 +1,40 @@
'''
OPF manifest trimming transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from lxml import etree
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME
# Compiled XPath selectors, one per attribute that can reference another
# file: stylesheet/link hrefs, image sources, object data and xl:href
# attributes on any element.
LINK_SELECTORS = [
    etree.XPath(expr, namespaces=XPNSMAP)
    for expr in (
        '//h:link/@href',
        '//h:img/@src',
        '//h:object/@data',
        '//*/@xl:href',
    )
]
class ManifestTrimmer(object):
    """OEB transform that removes manifest items nothing refers to."""

    def transform(self, oeb, context):
        """Trim every manifest item that is not referenced.

        An item counts as referenced when it is in the spine, when a spine
        document links to it through one of LINK_SELECTORS, or when a
        metadata value equals its href or its manifest id.

        oeb     -- the book being transformed; its manifest is modified in
                   place and progress is reported through oeb.logger.
        context -- accepted for the common transform interface; not used.
        """
        oeb.logger.info('Trimming unused files from manifest...')
        used = set()
        for item in oeb.spine:
            used.add(item.href)
            for selector in LINK_SELECTORS:
                for href in selector(item.data):
                    # Record the target without any '#fragment': the
                    # comparison below is against item.href, and a link
                    # such as 'img.svg#layer' must still keep 'img.svg'.
                    # (assumes abshref returns a string -- TODO confirm)
                    used.add(item.abshref(href).partition('#')[0])
        # TODO: Things mentioned in CSS
        # TODO: Things mentioned in SVG
        # Who knows what people will do...
        for term in oeb.metadata:
            for entry in oeb.metadata[term]:
                value = entry.value
                if value in oeb.manifest.hrefs:
                    used.add(value)
                elif value in oeb.manifest.ids:
                    used.add(oeb.manifest.ids[value].href)
        # Snapshot the manifest before removing anything, so removal never
        # happens while the manifest's own collection is being iterated.
        unused = [item for item in list(oeb.manifest.values())
                  if item.href not in used]
        for item in unused:
            oeb.logger.info('Trimming %r from manifest' % item.href)
            oeb.manifest.remove(item)