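More fixes and improvements: emit MobiML elements in the XHTML namespace, move HTML cover removal into MobiMLizer, add forward/backward decint encoding, add a ManifestTrimmer transform, and log the progress of each transform.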

This commit is contained in:
Marshall T. Vandegrift 2009-01-08 08:47:59 -05:00
parent c02491eddc
commit a7753d3420
6 changed files with 115 additions and 52 deletions

View File

@ -19,6 +19,8 @@ from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
# Namespace URI of the Mobipocket-specific markup extensions.
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    """Return *name* qualified with the Mobipocket namespace.

    The result uses the ``{namespace-uri}local-name`` form.
    """
    qualified = '{%s}%s' % (MBP_NS, name)
    return qualified
# Namespace map handed to the <html> root built for each converted spine
# item: XHTML is the default namespace, Mobipocket extensions use 'mbp'.
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS}
# Heading tags that keep their own name in the output; other plain blocks
# are emitted as <p>.
HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
# Tags emitted under their own name and recorded in the block state's
# `nested` list.
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td', 'th'])
# Table-related tags; elements with these tags get valign="top".
TABLE_TAGS = set(['table', 'tr', 'td', 'th'])
@ -77,26 +79,34 @@ class FormatState(object):
class MobiMLizer(object):
def __init__(self):
pass
def transform(self, oeb, context):
    """Convert the spine documents of *oeb* to Mobipocket markup.

    Records the book and the destination profile (``context.dest``) on the
    instance, builds the font-size lookup tables, removes the HTML cover
    page and then converts every spine item.
    """
    oeb.logger.info('Converting XHTML to Mobipocket markup...')
    self.oeb = oeb
    profile = context.dest
    self.profile = profile
    # Inverted copy of the profile's fnums table: its values become keys.
    inverted = dict((v, k) for k, v in profile.fnums.items())
    self.fnums = inverted
    self.fmap = KeyMapper(profile.fbase, profile.fbase, inverted.keys())
    self.remove_html_cover()
    self.mobimlize_spine()
def remove_html_cover(self):
    """Drop the HTML cover page when the book has cover metadata.

    Does nothing unless the metadata has a cover entry and the guide has a
    'cover' reference. Otherwise the guide reference is deleted and the
    manifest item it points at is removed.
    """
    oeb = self.oeb
    if not oeb.metadata.cover or 'cover' not in oeb.guide:
        return
    href = oeb.guide['cover'].href
    del oeb.guide['cover']
    # Guide references may carry a '#fragment', while the manifest is keyed
    # by the bare path; strip the fragment before looking the item up.
    path = href.split('#', 1)[0]
    hrefs = oeb.manifest.hrefs
    # The guide entry is already gone at this point, so a missing manifest
    # item is skipped rather than raised as KeyError.
    if path in hrefs:
        oeb.manifest.remove(hrefs[path])
def mobimlize_spine(self):
    """Replace each spine item's document with a MobiML rendition.

    For every spine item a new <html> root (using MOBI_NSMAP) with an empty
    <body> is created, the original <body> is converted into it through
    mobimlize_elem(), and the new root becomes the item's data.
    """
    for item in self.oeb.spine:
        stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
        body = item.data.find(XHTML('body'))
        # Build a fresh tree instead of patching the source document in
        # place; the new root has no <head> child.
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        nbody = etree.SubElement(nroot, XHTML('body'))
        self.mobimlize_elem(body, stylizer, BlockState(nbody),
                            [FormatState()])
        item.data = nroot
def mobimlize_font(self, ptsize):
return self.fnums[self.fmap[ptsize]]
@ -116,7 +126,7 @@ class MobiMLizer(object):
lines = text.split('\n')
result = lines[:1]
for line in lines[1:]:
result.append(etree.Element('br'))
result.append(etree.Element(XHTML('br')))
if line:
result.append(line)
return result
@ -134,7 +144,7 @@ class MobiMLizer(object):
bstate.pbreak = False
if istate.ids:
for id in istate.ids:
etree.SubElement(body, 'a', attrib={'id': id})
etree.SubElement(body, XHTML('a'), attrib={'id': id})
istate.ids.clear()
bstate.istate = None
bstate.anchor = None
@ -147,22 +157,22 @@ class MobiMLizer(object):
elif indent != 0 and abs(indent) < self.profile.fbase:
indent = (indent / abs(indent)) * self.profile.fbase
if tag in NESTABLE_TAGS:
para = wrapper = etree.SubElement(parent, tag)
para = wrapper = etree.SubElement(parent, XHTML(tag))
bstate.nested.append(para)
if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1
para.attrib['value'] = str(istates[-2].list_num)
elif left > 0 and indent >= 0:
para = wrapper = etree.SubElement(parent, 'blockquote')
para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
para = wrapper
emleft = int(round(left / self.profile.fbase)) - 1
emleft = min((emleft, 10))
while emleft > 0:
para = etree.SubElement(para, 'blockquote')
para = etree.SubElement(para, XHTML('blockquote'))
emleft -= 1
else:
ptag = tag if tag in HEADER_TAGS else 'p'
para = wrapper = etree.SubElement(parent, ptag)
para = wrapper = etree.SubElement(parent, XHTML(ptag))
bstate.inline = bstate.para = para
vspace = bstate.vpadding + bstate.vmargin
bstate.vpadding = bstate.vmargin = 0
@ -174,7 +184,7 @@ class MobiMLizer(object):
vspace = int(round(vspace / self.profile.fbase))
index = max((0, len(body) - 1))
while vspace > 0:
body.insert(index, etree.Element('br'))
body.insert(index, etree.Element(XHTML('br')))
vspace -= 1
if istate.halign != 'auto':
para.attrib['align'] = istate.halign
@ -182,7 +192,7 @@ class MobiMLizer(object):
if tag in CONTENT_TAGS:
bstate.inline = para
pstate = bstate.istate = None
etree.SubElement(para, tag, attrib=istate.attrib)
etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
elif tag in TABLE_TAGS:
para.attrib['valign'] = 'top'
if not text:
@ -197,20 +207,21 @@ class MobiMLizer(object):
elif pstate and pstate.href == href:
inline = bstate.anchor
else:
inline = etree.SubElement(inline, 'a', href=href)
inline = etree.SubElement(inline, XHTML('a'), href=href)
bstate.anchor = inline
if valign == 'super':
inline = etree.SubElement(inline, 'sup')
inline = etree.SubElement(inline, XHTML('sup'))
elif valign == 'sub':
inline = etree.SubElement(inline, 'sub')
inline = etree.SubElement(inline, XHTML('sub'))
if istate.family == 'monospace':
inline = etree.SubElement(inline, 'tt')
inline = etree.SubElement(inline, XHTML('tt'))
if fsize != 3:
inline = etree.SubElement(inline, 'font', size=str(fsize))
inline = etree.SubElement(inline, XHTML('font'),
size=str(fsize))
if istate.italic:
inline = etree.SubElement(inline, 'i')
inline = etree.SubElement(inline, XHTML('i'))
if istate.bold:
inline = etree.SubElement(inline, 'b')
inline = etree.SubElement(inline, XHTML('b'))
bstate.inline = inline
bstate.istate = istate
inline = bstate.inline
@ -353,7 +364,7 @@ class MobiMLizer(object):
if isblock:
para = bstate.para
if para is not None and para.text == u'\xa0':
para.getparent().replace(para, etree.Element('br'))
para.getparent().replace(para, etree.Element(XHTML('br')))
bstate.para = None
bstate.istate = None
vmargin = asfloat(style['margin-bottom'])

View File

@ -26,6 +26,7 @@ from calibre.ebooks.oeb.base import FauxLogger, OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS, MBP, MobiMLizer
@ -66,23 +67,28 @@ def encode(data):
return data.encode('utf-8')
# Almost like the one for MS LIT, but not quite.
# Direction selectors for decint(): which output byte carries the 0x80 flag.
DECINT_FORWARD = 0
DECINT_BACKWARD = 1


def decint(value, direction=DECINT_FORWARD):
    """Encode a non-negative integer as a variable-width byte string.

    The value is split into 7-bit groups which are emitted most-significant
    group first. Exactly one byte has its high bit (0x80) set:

    * DECINT_FORWARD sets it on the last byte of the output.
    * DECINT_BACKWARD sets it on the first byte of the output.

    *direction* defaults to DECINT_FORWARD, which matches the single-argument
    form of this function.

    Raises ValueError for a negative *value* (the shift loop would never
    terminate) or an unrecognised *direction* (no byte would be flagged).
    """
    if value < 0:
        raise ValueError('decint() requires a non-negative value: %r'
                         % (value,))
    if direction not in (DECINT_FORWARD, DECINT_BACKWARD):
        raise ValueError('unknown decint() direction: %r' % (direction,))
    # Collected least-significant group first; reversed on output.
    groups = []
    while True:
        groups.append(value & 0x7f)
        value >>= 7
        if value == 0:
            break
    if direction == DECINT_FORWARD:
        groups[0] |= 0x80
    else:
        groups[-1] |= 0x80
    return ''.join(chr(b) for b in reversed(groups))
class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
def __init__(self, oeb, images):
oeb.logger.info('Serializing markup content...')
self.oeb = oeb
self.images = images
self.id_offsets = {}
@ -238,22 +244,11 @@ class MobiWriter(object):
self._oeb = oeb
self._stream = stream
self._records = [None]
self._remove_html_cover()
self._generate_content()
self._generate_record0()
self._write_header()
self._write_content()
def _remove_html_cover(self):
oeb = self._oeb
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
oeb.manifest.remove(item)
def _generate_content(self):
self._map_image_names()
self._generate_text()
@ -318,11 +313,17 @@ class MobiWriter(object):
running = offset
while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3
encoded = decint(pbreak)
encoded = decint(pbreak, DECINT_FORWARD)
record.write(encoded)
running += pbreak << 3
nextra += len(encoded)
record.write(decint(nextra + 1))
lsize = 1
while True:
size = decint(nextra + lsize, DECINT_BACKWARD)
if len(size) == lsize:
break
lsize += 1
record.write(size)
self._records.append(record.getvalue())
nrecords += 1
offset += RECORD_SIZE
@ -479,10 +480,12 @@ def main(argv=sys.argv):
flattener = CSSFlattener(fbase=fbase, fkey=fkey, unfloat=True,
untable=True)
rasterizer = SVGRasterizer()
trimmer = ManifestTrimmer()
mobimlizer = MobiMLizer()
#flattener.transform(oeb, context)
flattener.transform(oeb, context)
rasterizer.transform(oeb, context)
#mobimlizer.transform(oeb, context)
mobimlizer.transform(oeb, context)
trimmer.transform(oeb, context)
writer.dump(oeb, outpath)
return 0

View File

@ -125,12 +125,17 @@ def urlnormalize(href):
class OEBError(Exception):
    """Exception type defined by this module; adds nothing to Exception."""
class FauxLogger(object):
    """Minimal stand-in logger that prints every message.

    Any ordinary attribute (``info``, ``warn``, ...) resolves to the logger
    itself, and calling the logger prints the message.
    """

    def __getattr__(self, name):
        """Return the logger itself for any ordinary attribute name.

        Special ``__dunder__`` names raise AttributeError instead. Otherwise
        protocol probes such as copy.deepcopy()'s ``__deepcopy__`` lookup
        would receive the logger, call it, and get None back.
        """
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        return self

    def __call__(self, message):
        """Print *message*."""
        # Parenthesised single-argument form: identical output under the
        # Python 2 print statement, and also valid as a function call.
        print(message)
class Logger(LoggingInterface, object):
    """LoggingInterface subclass whose ``log_<name>`` attributes can also be
    reached under the short ``<name>``."""

    def __getattr__(self, name):
        """Resolve an otherwise missing attribute *name* as ``log_<name>``.

        Raises AttributeError when no ``log_<name>`` attribute exists.
        """
        prefixed = 'log_' + name
        return object.__getattribute__(self, prefixed)
class AbstractContainer(object):
def read_xml(self, path):
@ -745,19 +750,19 @@ class OEBBook(object):
self.uid = item
break
else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
self.logger.warn(u'Unique-identifier %r not found.' % uid)
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.uid = metadata.identifier[0]
break
if not metadata.language:
self.logger.log_warn(u'Language not specified.')
self.logger.warn(u'Language not specified.')
metadata.add('language', 'en')
if not metadata.creator:
self.logger.log_warn(u'Creator not specified.')
self.logger.warn(u'Creator not specified.')
metadata.add('creator', 'Unknown')
if not metadata.title:
self.logger.log_warn(u'Title not specified.')
self.logger.warn(u'Title not specified.')
metadata.add('title', 'Unknown')
def _manifest_from_opf(self, opf):
@ -765,7 +770,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
href = elem.get('href')
if not self.container.exists(href):
self.logger.log_warn(u'Manifest item %r not found.' % href)
self.logger.warn(u'Manifest item %r not found.' % href)
continue
manifest.add(elem.get('id'), href, elem.get('media-type'),
elem.get('fallback'))
@ -775,7 +780,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref')
if idref not in self.manifest:
self.logger.log_warn(u'Spine item %r not found.' % idref)
self.logger.warn(u'Spine item %r not found.' % idref)
continue
item = self.manifest[idref]
spine.add(item, elem.get('linear'))
@ -794,7 +799,7 @@ class OEBBook(object):
href = elem.get('href')
path, frag = urldefrag(href)
if path not in self.manifest.hrefs:
self.logger.log_warn(u'Guide reference %r not found' % href)
self.logger.warn(u'Guide reference %r not found' % href)
continue
guide.add(elem.get('type'), elem.get('title'), href)
@ -993,7 +998,6 @@ class OEBBook(object):
NCX_MIME: (href, ncx)}
def main(argv=sys.argv):
for arg in argv[1:]:
oeb = OEBBook(arg)

View File

@ -88,6 +88,7 @@ class CSSFlattener(object):
self.untable = untable
def transform(self, oeb, context):
oeb.logger.info('Flattening CSS and remapping font sizes...')
self.oeb = oeb
self.context = context
self.stylize_spine()

View File

@ -33,6 +33,7 @@ class SVGRasterizer(object):
QApplication([])
def transform(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb
self.profile = context.dest
self.images = {}
@ -143,6 +144,9 @@ class SVGRasterizer(object):
if key in self.images:
href = self.images[key]
else:
logger = self.oeb.logger
logger.info('Rasterizing %r to %dx%d'
% (svgitem.href, size.width(), size.height()))
image = QImage(size, QImage.Format_ARGB32_Premultiplied)
image.fill(QColor("white").rgb())
painter = QPainter(image)

View File

@ -0,0 +1,40 @@
'''
OPF manifest trimming transform.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from lxml import etree
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME
# Compiled XPath selectors, one per attribute that can reference another
# file: link/@href, img/@src, object/@data and any xl:href attribute.
LINK_SELECTORS = [
    etree.XPath(expr, namespaces=XPNSMAP)
    for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
                 '//*/@xl:href')]
class ManifestTrimmer(object):
    """Transform that removes manifest items the book does not reference."""

    def transform(self, oeb, context):
        """Remove every manifest item of *oeb* that is not in use.

        An item counts as used when it is in the spine, is referenced from a
        spine document through one of LINK_SELECTORS, or is named (by href
        or by manifest id) in a metadata value. *context* is not consulted.
        """
        oeb.logger.info('Trimming unused files from manifest...')
        used = set()
        for item in oeb.spine:
            used.add(item.href)
            for selector in LINK_SELECTORS:
                for href in selector(item.data):
                    used.add(item.abshref(href))
        # TODO: Things mentioned in CSS
        # TODO: Things mentioned in SVG
        # Who knows what people will do...
        for term in oeb.metadata:
            for entry in oeb.metadata[term]:
                value = entry.value
                if value in oeb.manifest.hrefs:
                    used.add(value)
                elif value in oeb.manifest.ids:
                    used.add(oeb.manifest.ids[value].href)
        # Iterate over a snapshot: items are removed from the manifest inside
        # the loop, which must not disturb the sequence being walked.
        for item in list(oeb.manifest.values()):
            if item.href not in used:
                oeb.logger.info('Trimming %r from manifest' % item.href)
                oeb.manifest.remove(item)