diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 199566271b..b544d6d2e0 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -19,6 +19,8 @@ from calibre.ebooks.oeb.transforms.flatcss import KeyMapper MBP_NS = 'http://mobipocket.com/ns/mbp' def MBP(name): return '{%s}%s' % (MBP_NS, name) +MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS} + HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td', 'th']) TABLE_TAGS = set(['table', 'tr', 'td', 'th']) @@ -77,26 +79,34 @@ class FormatState(object): class MobiMLizer(object): - def __init__(self): - pass - def transform(self, oeb, context): + oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb self.profile = profile = context.dest self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) + self.remove_html_cover() self.mobimlize_spine() + def remove_html_cover(self): + oeb = self.oeb + if not oeb.metadata.cover \ + or 'cover' not in oeb.guide: + return + href = oeb.guide['cover'].href + del oeb.guide['cover'] + item = oeb.manifest.hrefs[href] + oeb.manifest.remove(item) + def mobimlize_spine(self): for item in self.oeb.spine: stylizer = Stylizer(item.data, item.href, self.oeb, self.profile) - data = item.data - data.remove(data.find(XHTML('head'))) - body = data.find(XHTML('body')) - nbody = etree.Element(XHTML('body')) + body = item.data.find(XHTML('body')) + nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) + nbody = etree.SubElement(nroot, XHTML('body')) self.mobimlize_elem(body, stylizer, BlockState(nbody), [FormatState()]) - data.replace(body, nbody) + item.data = nroot def mobimlize_font(self, ptsize): return self.fnums[self.fmap[ptsize]] @@ -116,7 +126,7 @@ class MobiMLizer(object): lines = text.split('\n') result = lines[:1] for line in lines[1:]: - result.append(etree.Element('br')) + result.append(etree.Element(XHTML('br'))) if line: result.append(line) return result @@ -134,7 +144,7 @@ class MobiMLizer(object): bstate.pbreak = False if istate.ids: for id in istate.ids: - etree.SubElement(body, 'a', attrib={'id': id}) + etree.SubElement(body, XHTML('a'), attrib={'id': id}) istate.ids.clear() bstate.istate = None bstate.anchor = None @@ -147,22 +157,22 @@ class MobiMLizer(object): elif indent != 0 and abs(indent) < self.profile.fbase: indent = (indent / abs(indent)) * self.profile.fbase if tag in NESTABLE_TAGS: - para = wrapper = etree.SubElement(parent, tag) + para = wrapper = etree.SubElement(parent, XHTML(tag)) bstate.nested.append(para) if tag == 'li' and len(istates) > 1: istates[-2].list_num += 1 para.attrib['value'] = str(istates[-2].list_num) elif left > 0 and indent >= 0: - para = wrapper = etree.SubElement(parent, 'blockquote') + para = wrapper = etree.SubElement(parent, XHTML('blockquote')) para = wrapper emleft = int(round(left / self.profile.fbase)) - 1 emleft = min((emleft, 10)) while emleft > 0: - para = etree.SubElement(para, 'blockquote') + para = etree.SubElement(para, XHTML('blockquote')) emleft -= 1 else: ptag = tag if tag in HEADER_TAGS else 'p' - para = wrapper = etree.SubElement(parent, ptag) + para = wrapper = etree.SubElement(parent, XHTML(ptag)) bstate.inline = bstate.para = para vspace = bstate.vpadding + bstate.vmargin bstate.vpadding = bstate.vmargin = 0 @@ -174,7 +184,7 @@ class MobiMLizer(object): vspace = int(round(vspace / self.profile.fbase)) index = max((0, len(body) - 1)) while vspace > 0: - body.insert(index, etree.Element('br')) + body.insert(index, etree.Element(XHTML('br'))) vspace -= 1 if istate.halign != 'auto': para.attrib['align'] = istate.halign @@ -182,7 +192,7 @@ class MobiMLizer(object): if tag in CONTENT_TAGS: bstate.inline = para pstate = bstate.istate = None - etree.SubElement(para, tag, attrib=istate.attrib) + etree.SubElement(para, XHTML(tag), attrib=istate.attrib) elif tag in TABLE_TAGS: para.attrib['valign'] = 'top' if not text: @@ -197,20 +207,21 @@ class MobiMLizer(object): elif pstate and pstate.href == href: inline = bstate.anchor else: - inline = etree.SubElement(inline, 'a', href=href) + inline = etree.SubElement(inline, XHTML('a'), href=href) bstate.anchor = inline if valign == 'super': - inline = etree.SubElement(inline, 'sup') + inline = etree.SubElement(inline, XHTML('sup')) elif valign == 'sub': - inline = etree.SubElement(inline, 'sub') + inline = etree.SubElement(inline, XHTML('sub')) if istate.family == 'monospace': - inline = etree.SubElement(inline, 'tt') + inline = etree.SubElement(inline, XHTML('tt')) if fsize != 3: - inline = etree.SubElement(inline, 'font', size=str(fsize)) + inline = etree.SubElement(inline, XHTML('font'), + size=str(fsize)) if istate.italic: - inline = etree.SubElement(inline, 'i') + inline = etree.SubElement(inline, XHTML('i')) if istate.bold: - inline = etree.SubElement(inline, 'b') + inline = etree.SubElement(inline, XHTML('b')) bstate.inline = inline bstate.istate = istate inline = bstate.inline @@ -353,7 +364,7 @@ class MobiMLizer(object): if isblock: para = bstate.para if para is not None and para.text == u'\xa0': - para.getparent().replace(para, etree.Element('br')) + para.getparent().replace(para, etree.Element(XHTML('br'))) bstate.para = None bstate.istate = None vmargin = asfloat(style['margin-bottom']) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index ed732b200d..c616e4041c 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -26,6 +26,7 @@ from calibre.ebooks.oeb.base import FauxLogger, OEBBook from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer +from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.mobiml import MBP_NS, MBP, MobiMLizer @@ -66,23 +67,28 @@ def encode(data): return data.encode('utf-8') # Almost like the one for MS LIT, but not quite. -def decint(value): +DECINT_FORWARD = 0 +DECINT_BACKWARD = 1 +def decint(value, direction): bytes = [] while True: b = value & 0x7f value >>= 7 - if not bytes: - b |= 0x80 - bytes.append(chr(b)) + bytes.append(b) if value == 0: break - return ''.join(reversed(bytes)) + if direction == DECINT_FORWARD: + bytes[0] |= 0x80 + elif direction == DECINT_BACKWARD: + bytes[-1] |= 0x80 + return ''.join(chr(b) for b in reversed(bytes)) class Serializer(object): NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} def __init__(self, oeb, images): + oeb.logger.info('Serializing markup content...') self.oeb = oeb self.images = images self.id_offsets = {} @@ -238,22 +244,11 @@ class MobiWriter(object): self._oeb = oeb self._stream = stream self._records = [None] - self._remove_html_cover() self._generate_content() self._generate_record0() self._write_header() self._write_content() - def _remove_html_cover(self): - oeb = self._oeb - if not oeb.metadata.cover \ - or 'cover' not in oeb.guide: - return - href = oeb.guide['cover'].href - del oeb.guide['cover'] - item = oeb.manifest.hrefs[href] - oeb.manifest.remove(item) - def _generate_content(self): self._map_image_names() self._generate_text() @@ -318,11 +313,17 @@ class MobiWriter(object): running = offset while breaks and (breaks[0] - offset) < RECORD_SIZE: pbreak = (breaks.pop(0) - running) >> 3 - encoded = decint(pbreak) + encoded = decint(pbreak, DECINT_FORWARD) record.write(encoded) running += pbreak << 3 nextra += len(encoded) - record.write(decint(nextra + 1)) + lsize = 1 + while True: + size = decint(nextra + lsize, DECINT_BACKWARD) + if len(size) == lsize: + break + lsize += 1 + record.write(size) self._records.append(record.getvalue()) nrecords += 1 offset += RECORD_SIZE @@ -479,10 +480,12 @@ def main(argv=sys.argv): flattener = CSSFlattener(fbase=fbase, fkey=fkey, unfloat=True, untable=True) rasterizer = SVGRasterizer() + trimmer = ManifestTrimmer() mobimlizer = MobiMLizer() - #flattener.transform(oeb, context) + flattener.transform(oeb, context) rasterizer.transform(oeb, context) - #mobimlizer.transform(oeb, context) + mobimlizer.transform(oeb, context) + trimmer.transform(oeb, context) writer.dump(oeb, outpath) return 0 diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 0fb6a3755b..f5262c977f 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -125,12 +125,17 @@ def urlnormalize(href): class OEBError(Exception): pass + class FauxLogger(object): def __getattr__(self, name): return self def __call__(self, message): print message +class Logger(LoggingInterface, object): + def __getattr__(self, name): + return object.__getattribute__(self, 'log_' + name) + class AbstractContainer(object): def read_xml(self, path): @@ -745,19 +750,19 @@ class OEBBook(object): self.uid = item break else: - self.logger.log_warn(u'Unique-identifier %r not found.' % uid) + self.logger.warn(u'Unique-identifier %r not found.' % uid) for ident in metadata.identifier: if 'id' in ident.attrib: self.uid = metadata.identifier[0] break if not metadata.language: - self.logger.log_warn(u'Language not specified.') + self.logger.warn(u'Language not specified.') metadata.add('language', 'en') if not metadata.creator: - self.logger.log_warn(u'Creator not specified.') + self.logger.warn(u'Creator not specified.') metadata.add('creator', 'Unknown') if not metadata.title: - self.logger.log_warn(u'Title not specified.') + self.logger.warn(u'Title not specified.') metadata.add('title', 'Unknown') def _manifest_from_opf(self, opf): @@ -765,7 +770,7 @@ class OEBBook(object): for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): href = elem.get('href') if not self.container.exists(href): - self.logger.log_warn(u'Manifest item %r not found.' % href) + self.logger.warn(u'Manifest item %r not found.' % href) continue manifest.add(elem.get('id'), href, elem.get('media-type'), elem.get('fallback')) @@ -775,7 +780,7 @@ class OEBBook(object): for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in self.manifest: - self.logger.log_warn(u'Spine item %r not found.' % idref) + self.logger.warn(u'Spine item %r not found.' % idref) continue item = self.manifest[idref] spine.add(item, elem.get('linear')) @@ -794,7 +799,7 @@ class OEBBook(object): href = elem.get('href') path, frag = urldefrag(href) if path not in self.manifest.hrefs: - self.logger.log_warn(u'Guide reference %r not found' % href) + self.logger.warn(u'Guide reference %r not found' % href) continue guide.add(elem.get('type'), elem.get('title'), href) @@ -993,7 +998,6 @@ class OEBBook(object): NCX_MIME: (href, ncx)} - def main(argv=sys.argv): for arg in argv[1:]: oeb = OEBBook(arg) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 86ae61d314..98f918aebf 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -88,6 +88,7 @@ class CSSFlattener(object): self.untable = untable def transform(self, oeb, context): + oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb self.context = context self.stylize_spine() diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index b57736beb9..1e6b6944e3 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -33,6 +33,7 @@ class SVGRasterizer(object): QApplication([]) def transform(self, oeb, context): + oeb.logger.info('Rasterizing SVG images...') self.oeb = oeb self.profile = context.dest self.images = {} @@ -143,6 +144,9 @@ class SVGRasterizer(object): if key in self.images: href = self.images[key] else: + logger = self.oeb.logger + logger.info('Rasterizing %r to %dx%d' + % (svgitem.href, size.width(), size.height())) image = QImage(size, QImage.Format_ARGB32_Premultiplied) image.fill(QColor("white").rgb()) painter = QPainter(image) diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py new file mode 100644 index 0000000000..062eed359b --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -0,0 +1,40 @@ +''' +OPF manifest trimming transform. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys +import os +from lxml import etree +from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME + +LINK_SELECTORS = [] +for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data', + '//*/@xl:href'): + LINK_SELECTORS.append(etree.XPath(expr, namespaces=XPNSMAP)) + +class ManifestTrimmer(object): + def transform(self, oeb, context): + oeb.logger.info('Trimming unused files from manifest...') + used = set() + for item in oeb.spine: + used.add(item.href) + for selector in LINK_SELECTORS: + for href in selector(item.data): + used.add(item.abshref(href)) + # TODO: Things mentioned in CSS + # TODO: Things mentioned in SVG + # Who knows what people will do... + for term in oeb.metadata: + for item in oeb.metadata[term]: + if item.value in oeb.manifest.hrefs: + used.add(item.value) + elif item.value in oeb.manifest.ids: + used.add(oeb.manifest.ids[item.value].href) + for item in oeb.manifest.values(): + if item.href not in used: + oeb.logger.info('Trimming %r from manifest' % item.href) + oeb.manifest.remove(item)