From e61c667505319bf3af8fbab575b0e7165a5bd143 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Mon, 29 Dec 2008 02:56:00 -0500 Subject: [PATCH] Link serialization. Basic image embedding. --- src/calibre/ebooks/lit/oeb.py | 26 +++++- src/calibre/ebooks/mobi/writer.py | 140 ++++++++++++++++++++++++------ 2 files changed, 137 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/lit/oeb.py b/src/calibre/ebooks/lit/oeb.py index acc3275876..39b22b7e7f 100644 --- a/src/calibre/ebooks/lit/oeb.py +++ b/src/calibre/ebooks/lit/oeb.py @@ -48,6 +48,8 @@ OEB_CSS_MIME = 'text/x-oeb1-css' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +MS_COVER_TYPE = 'other.ms-coverimage-standard' + def element(parent, *args, **kwargs): if parent is not None: @@ -153,7 +155,7 @@ class Metadata(object): % (barename(self.term), self.value, self.attrib) def __str__(self): - return str(self.value) + return self.value.encode('ascii', 'xmlcharrefreplace') def __unicode__(self): return unicode(self.value) @@ -687,6 +689,26 @@ class OEBBook(object): if self._toc_from_tour(opf): return if self._toc_from_html(opf): return self._toc_from_spine(opf) + + def _ensure_cover_image(self): + cover = None + if self.metadata.cover: + id = str(self.metadata.cover[0]) + cover = self.manifest[id] + elif MS_COVER_TYPE in self.guide: + href = self.guide[MS_COVER_TYPE].href + cover = self.manifest.hrefs[href] + elif 'cover' in self.guide: + href = self.guide['cover'].href + cover = self.manifest.hrefs[href] + else: + html = self.spine[0].data + imgs = xpath(html, '//h:img[position()=1]') + href = imgs[0].get('src') if imgs else None + cover = self.manifest.hrefs[href] if href else None + if cover: + if not self.metadata.cover: + self.metadata.add('cover', cover.id) def _all_from_opf(self, opf): self._metadata_from_opf(opf) @@ -694,6 +716,7 @@ class OEBBook(object): self._spine_from_opf(opf) self._guide_from_opf(opf) self._toc_from_opf(opf) + self._ensure_cover_image() def to_opf1(self): package = etree.Element('package', @@ -757,6 +780,7 @@ class OEBBook(object): NCX_MIME: (href, ncx)} + def main(argv=sys.argv): for arg in argv[1:]: oeb = OEBBook(arg) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index b16fc8e2c4..614764718d 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -15,10 +15,12 @@ import random from cStringIO import StringIO import re from itertools import izip, count +from collections import defaultdict +from urlparse import urldefrag from lxml import etree from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.lit.oeb import XHTML, XHTML_NS, OEB_DOCS -from calibre.ebooks.lit.oeb import barename, namespace +from calibre.ebooks.lit.oeb import xpath, barename, namespace from calibre.ebooks.lit.oeb import FauxLogger, OEBBook MBP_NS = 'http://mobipocket.cam/ns/mbp' @@ -43,31 +45,105 @@ UNCOMPRESSED = 1 PALMDOC = 2 HUFFDIC = 17480 +COLLAPSE = re.compile(r'[ \t\r\n\v]+') + +def encode(data): + return COLLAPSE.sub(' ', data).encode('ascii', 'xmlcharrefreplace') + class Serializer(object): def __init__(self, oeb, images): self.oeb = oeb self.images = images - self.root = etree.Element(XHTML('html'), - nsmap={None: XHTML_NS, 'mbp': MBP_NS}) - self.generate_head() - self.generate_body() + self.id_offsets = {} + self.href_offsets = defaultdict(list) + buffer = self.buffer = StringIO() + buffer.write('') + self.serialize_head() + self.serialize_body() + buffer.write('') + self.fixup_links() + self.raw = buffer.getvalue() def __str__(self): - return etree.tostring(self.root) + return self.raw - def generate_head(self): - head = etree.SubElement(self.root, XHTML('head')) - - def generate_body(self): - body = etree.SubElement(self.root, XHTML('body')) - first = True + def serialize_head(self): + buffer = self.buffer + buffer.write('') + buffer.write('') + + def serialize_body(self): + buffer = self.buffer + buffer.write('') for item in self.oeb.spine: - if item.media_type not in OEB_DOCS: continue - for elem in item.data.find(XHTML('body')): - body.append(elem) - etree.SubElement(body, MBP('pagebreak')) + self.serialize_item(item) + buffer.write('') + def serialize_item(self, item): + buffer = self.buffer + buffer.write('') + # TODO: Figure out how to make the 'crossable' stuff work for + # non-"linear" spine items. + self.id_offsets[item.id + '_calibre_top'] = buffer.tell() + for elem in item.data.find(XHTML('body')): + self.serialize_elem(elem, item) + + def serialize_elem(self, elem, item): + ns = namespace(elem.tag) + if ns not in (XHTML_NS, MBP_NS): + return + buffer = self.buffer + hrefs = self.oeb.manifest.hrefs + tag = barename(elem.tag) + if ns == MBP_NS: tag = 'mbp:' + tag + for attr in ('name', 'id'): + if attr in elem.attrib: + id = '_'.join((item.id, elem.attrib[attr])) + self.id_offsets[id] = buffer.tell() + del elem.attrib[attr] + buffer.write('<') + buffer.write(tag) + if elem.attrib: + for attr, val in elem.attrib.items(): + buffer.write(' ') + if attr == 'href': + path, frag = urldefrag(val) + # TODO: Absolute path translation + if not path or path in hrefs: + id = hrefs[path].id if path else item.id + frag = frag if frag else 'calibre_top' + href = '_'.join((id, frag)) + buffer.write('filepos=') + self.href_offsets[href].append(buffer.tell()) + buffer.write('0000000000') + continue + elif attr == 'src' and val in hrefs: + index = self.images[val] + buffer.write('recindex="%05d"' % index) + continue + buffer.write('%s="%s"' % (attr, val)) + if not elem.text and len(elem) == 0: + buffer.write('/>') + return + buffer.write('>') + if elem.text: + buffer.write(encode(elem.text)) + for child in elem: + self.serialize_elem(child, item) + buffer.write('' % tag) + if elem.tail: + buffer.write(encode(elem.tail)) + + def fixup_links(self): + buffer = self.buffer + for id, hoffs in self.href_offsets.items(): + ioff = self.id_offsets[id] + for hoff in hoffs: + buffer.seek(hoff) + buffer.write('%010d' % ioff) + + def preserve(function): def wrapper(self, *args, **kwargs): opos = self._stream.tell() @@ -79,8 +155,8 @@ def preserve(function): return wrapper class MobiWriter(object): - def __init__(self, compress=PALMDOC, logger=FauxLogger()): - self._compress = compress or 1 + def __init__(self, compress=None, logger=FauxLogger()): + self._compress = compress or UNCOMPRESSED self._logger = logger def dump(self, oeb, path): @@ -93,11 +169,6 @@ class MobiWriter(object): for datum in data: self._stream.write(datum) - @preserve - def _writeat(self, pos, *data): - self._stream.seek(pos) - self._write(*data) - def _tell(self): return self._stream.tell() @@ -116,7 +187,7 @@ class MobiWriter(object): self._generate_images() def _map_image_names(self): - index = 0 + index = 1 self._images = images = {} for item in self._oeb.manifest.values(): if item.media_type.startswith('image/'): @@ -140,7 +211,13 @@ class MobiWriter(object): self._text_nrecords = nrecords def _generate_images(self): - pass + images = [(index, href) for href, index in self._images.items()] + images.sort() + for _, href in images: + item = self._oeb.manifest.hrefs[href] + data = item.data + # TODO: Re-size etc images + self._records.append(data) def _generate_record0(self): exth = self._build_exth() @@ -167,8 +244,6 @@ class MobiWriter(object): 0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 1, 0xffffffff)) record0.write(exth) - npad = 4 - (record0.tell() % 4) - if npad < 4: record0.write('\0' * npad) record0.write(title) record0 = record0.getvalue() self._records[0] = record0 + ('\0' * (2452 - len(record0))) @@ -185,8 +260,17 @@ class MobiWriter(object): exth.write(pack('>II', code, len(data) + 8)) exth.write(data) nrecs += 1 + if oeb.metadata.cover: + id = str(oeb.metadata.cover[0]) + href = oeb.manifest[id].href + index = self._images[href] + self._text_nrecords - 1 + exth.write(pack('>III', 0xc9, 0x0c, index)) + nrecs += 1 + trail = exth.tell() % 4 + pad = '' if not trail else '\0' * (4 - trail) exth = exth.getvalue() - return ''.join(['EXTH', pack('>II', len(exth) + 12, nrecs), exth]) + exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad] + return ''.join(exth) def _write_header(self): title = str(self._oeb.metadata.title[0])