Link serialization. Basic image embedding.

This commit is contained in:
Marshall T. Vandegrift 2008-12-29 02:56:00 -05:00
parent 61f2854c11
commit e61c667505
2 changed files with 137 additions and 29 deletions

View File

@ -48,6 +48,8 @@ OEB_CSS_MIME = 'text/x-oeb1-css'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
def element(parent, *args, **kwargs): def element(parent, *args, **kwargs):
if parent is not None: if parent is not None:
@ -153,7 +155,7 @@ class Metadata(object):
% (barename(self.term), self.value, self.attrib) % (barename(self.term), self.value, self.attrib)
def __str__(self): def __str__(self):
return str(self.value) return self.value.encode('ascii', 'xmlcharrefreplace')
def __unicode__(self): def __unicode__(self):
return unicode(self.value) return unicode(self.value)
@ -688,12 +690,33 @@ class OEBBook(object):
if self._toc_from_html(opf): return if self._toc_from_html(opf): return
self._toc_from_spine(opf) self._toc_from_spine(opf)
def _ensure_cover_image(self):
    """Record a cover image in the metadata when none is declared yet.

    Candidates are tried in this order: the guide entry of type
    ``MS_COVER_TYPE``, the guide entry of type ``'cover'``, and finally
    the first ``<img>`` matched in the first spine item.  A candidate is
    only used when its href is present in ``self.manifest.hrefs``;
    otherwise the metadata is left without a cover rather than raising
    ``KeyError``.
    """
    if self.metadata.cover:
        # A cover is already declared.  Nothing is ever added in this
        # case, so there is no work to do.
        return
    hrefs = self.manifest.hrefs
    href = None
    if MS_COVER_TYPE in self.guide:
        href = self.guide[MS_COVER_TYPE].href
    elif 'cover' in self.guide:
        href = self.guide['cover'].href
    else:
        try:
            first = self.spine[0]
        except IndexError:
            # Empty spine: there is no document to search for an image.
            return
        imgs = xpath(first.data, '//h:img[position()=1]')
        if imgs:
            href = imgs[0].get('src')
    # Guide entries and <img src> values may point at files that are not
    # listed in the manifest; those cannot serve as the cover.
    if href and href in hrefs:
        self.metadata.add('cover', hrefs[href].id)
def _all_from_opf(self, opf): def _all_from_opf(self, opf):
self._metadata_from_opf(opf) self._metadata_from_opf(opf)
self._manifest_from_opf(opf) self._manifest_from_opf(opf)
self._spine_from_opf(opf) self._spine_from_opf(opf)
self._guide_from_opf(opf) self._guide_from_opf(opf)
self._toc_from_opf(opf) self._toc_from_opf(opf)
self._ensure_cover_image()
def to_opf1(self): def to_opf1(self):
package = etree.Element('package', package = etree.Element('package',
@ -757,6 +780,7 @@ class OEBBook(object):
NCX_MIME: (href, ncx)} NCX_MIME: (href, ncx)}
def main(argv=sys.argv): def main(argv=sys.argv):
for arg in argv[1:]: for arg in argv[1:]:
oeb = OEBBook(arg) oeb = OEBBook(arg)

View File

@ -15,10 +15,12 @@ import random
from cStringIO import StringIO from cStringIO import StringIO
import re import re
from itertools import izip, count from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
from lxml import etree from lxml import etree
from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.lit.oeb import XHTML, XHTML_NS, OEB_DOCS from calibre.ebooks.lit.oeb import XHTML, XHTML_NS, OEB_DOCS
from calibre.ebooks.lit.oeb import barename, namespace from calibre.ebooks.lit.oeb import xpath, barename, namespace
from calibre.ebooks.lit.oeb import FauxLogger, OEBBook from calibre.ebooks.lit.oeb import FauxLogger, OEBBook
# Namespace URI of Mobipocket's ``mbp:`` extension elements
# (the host is mobipocket.com; ".cam" was a typo).
MBP_NS = 'http://mobipocket.com/ns/mbp'
@ -43,30 +45,104 @@ UNCOMPRESSED = 1
PALMDOC = 2 PALMDOC = 2
HUFFDIC = 17480 HUFFDIC = 17480
# Any run of ASCII whitespace; such runs are collapsed to a single space.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')


def encode(data):
    """Return text ``data`` as ASCII-only character data safe for markup.

    Whitespace runs are collapsed to one space, the markup-significant
    characters ``&``, ``<`` and ``>`` are replaced by entity references,
    and every non-ASCII character becomes a numeric character reference.
    """
    text = COLLAPSE.sub(' ', data)
    # '&' must be escaped first so the entities produced below are not
    # themselves re-escaped.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    return text.encode('ascii', 'xmlcharrefreplace')
class Serializer(object): class Serializer(object):
def __init__(self, oeb, images): def __init__(self, oeb, images):
self.oeb = oeb self.oeb = oeb
self.images = images self.images = images
self.root = etree.Element(XHTML('html'), self.id_offsets = {}
nsmap={None: XHTML_NS, 'mbp': MBP_NS}) self.href_offsets = defaultdict(list)
self.generate_head() buffer = self.buffer = StringIO()
self.generate_body() buffer.write('<html>')
self.serialize_head()
self.serialize_body()
buffer.write('</html>')
self.fixup_links()
self.raw = buffer.getvalue()
def __str__(self): def __str__(self):
return etree.tostring(self.root) return self.raw
def generate_head(self): def serialize_head(self):
head = etree.SubElement(self.root, XHTML('head')) buffer = self.buffer
buffer.write('<head>')
buffer.write('</head>')
def generate_body(self): def serialize_body(self):
body = etree.SubElement(self.root, XHTML('body')) buffer = self.buffer
first = True buffer.write('<body>')
for item in self.oeb.spine: for item in self.oeb.spine:
if item.media_type not in OEB_DOCS: continue self.serialize_item(item)
for elem in item.data.find(XHTML('body')): buffer.write('</body>')
body.append(elem)
etree.SubElement(body, MBP('pagebreak')) def serialize_item(self, item):
buffer = self.buffer
buffer.write('<mbp:pagebreak/>')
# TODO: Figure out how to make the 'crossable' stuff work for
# non-"linear" spine items.
self.id_offsets[item.id + '_calibre_top'] = buffer.tell()
for elem in item.data.find(XHTML('body')):
self.serialize_elem(elem, item)
def serialize_elem(self, elem, item):
    """Write ``elem`` and its subtree to ``self.buffer`` as markup.

    Only elements in the XHTML or MBP namespaces are emitted; elements
    in any other namespace are skipped together with their children.
    The text following the element (``elem.tail``) is always written,
    because it belongs to the parent's content even when the element
    itself is empty or skipped.

    Side effects on ``self``:

    * ``name``/``id`` attributes are not emitted; instead the current
      buffer offset is stored in ``self.id_offsets`` under
      ``<item.id>_<value>``.
    * ``href`` attributes that are fragment-only or whose path is in the
      manifest are replaced by a ``filepos=`` placeholder of ten zeros;
      the placeholder's offset is appended to ``self.href_offsets``.
    * ``src`` attributes found in the manifest are replaced by a
      ``recindex`` taken from ``self.images``.

    The source tree is left unmodified.
    """
    buffer = self.buffer
    ns = namespace(elem.tag)
    if ns in (XHTML_NS, MBP_NS):
        hrefs = self.oeb.manifest.hrefs
        tag = barename(elem.tag)
        if ns == MBP_NS:
            tag = 'mbp:' + tag
        anchor_attrs = ('name', 'id')
        # Anchors become buffer offsets; record them before the tag opens
        # so links land on the start of the element.
        for attr in anchor_attrs:
            if attr in elem.attrib:
                anchor = '_'.join((item.id, elem.attrib[attr]))
                self.id_offsets[anchor] = buffer.tell()
        buffer.write('<')
        buffer.write(tag)
        for attr, val in elem.attrib.items():
            if attr in anchor_attrs:
                # Consumed above; skipped here instead of being deleted
                # from the element so the input tree stays intact.
                continue
            buffer.write(' ')
            if attr == 'href':
                path, frag = urldefrag(val)
                # TODO: Absolute path translation
                if not path or path in hrefs:
                    target = hrefs[path].id if path else item.id
                    frag = frag if frag else 'calibre_top'
                    href = '_'.join((target, frag))
                    buffer.write('filepos=')
                    self.href_offsets[href].append(buffer.tell())
                    # Fixed-width placeholder, overwritten by fixup_links.
                    buffer.write('0000000000')
                    continue
            elif attr == 'src' and val in hrefs:
                index = self.images[val]
                buffer.write('recindex="%05d"' % index)
                continue
            # Escape characters that would terminate or corrupt the
            # quoted value, and keep the output ASCII-only.
            val = val.replace('&', '&amp;').replace('<', '&lt;')
            val = val.replace('"', '&quot;')
            val = val.encode('ascii', 'xmlcharrefreplace')
            buffer.write('%s="%s"' % (attr, val))
        if elem.text or len(elem) > 0:
            buffer.write('>')
            if elem.text:
                buffer.write(encode(elem.text))
            for child in elem:
                self.serialize_elem(child, item)
            buffer.write('</%s>' % tag)
        else:
            buffer.write('/>')
    # Written for every element, including empty ones such as <br/> and
    # skipped foreign-namespace ones, so no trailing text is lost.
    if elem.tail:
        buffer.write(encode(elem.tail))
def fixup_links(self):
    """Overwrite each ``filepos=`` placeholder with its target's offset.

    For every key in ``self.href_offsets`` the matching offset is looked
    up in ``self.id_offsets`` and written, zero-padded to ten digits, at
    each recorded placeholder position in ``self.buffer``.  Links whose
    target was never recorded keep their all-zero placeholder instead of
    raising ``KeyError``.  The buffer position is restored afterwards.
    """
    buffer = self.buffer
    end = buffer.tell()
    for anchor, hoffs in self.href_offsets.items():
        if anchor not in self.id_offsets:
            # Dangling link: no such anchor was serialized.
            continue
        ioff = self.id_offsets[anchor]
        for hoff in hoffs:
            buffer.seek(hoff)
            buffer.write('%010d' % ioff)
    # Seeking for the patches moved the position; put it back so any
    # later write appends rather than overwrites.
    buffer.seek(end)
def preserve(function): def preserve(function):
def wrapper(self, *args, **kwargs): def wrapper(self, *args, **kwargs):
@ -79,8 +155,8 @@ def preserve(function):
return wrapper return wrapper
class MobiWriter(object): class MobiWriter(object):
def __init__(self, compress=PALMDOC, logger=FauxLogger()): def __init__(self, compress=None, logger=FauxLogger()):
self._compress = compress or 1 self._compress = compress or UNCOMPRESSED
self._logger = logger self._logger = logger
def dump(self, oeb, path): def dump(self, oeb, path):
@ -93,11 +169,6 @@ class MobiWriter(object):
for datum in data: for datum in data:
self._stream.write(datum) self._stream.write(datum)
@preserve
def _writeat(self, pos, *data):
self._stream.seek(pos)
self._write(*data)
def _tell(self): def _tell(self):
return self._stream.tell() return self._stream.tell()
@ -116,7 +187,7 @@ class MobiWriter(object):
self._generate_images() self._generate_images()
def _map_image_names(self): def _map_image_names(self):
index = 0 index = 1
self._images = images = {} self._images = images = {}
for item in self._oeb.manifest.values(): for item in self._oeb.manifest.values():
if item.media_type.startswith('image/'): if item.media_type.startswith('image/'):
@ -140,7 +211,13 @@ class MobiWriter(object):
self._text_nrecords = nrecords self._text_nrecords = nrecords
def _generate_images(self): def _generate_images(self):
pass images = [(index, href) for href, index in self._images.items()]
images.sort()
for _, href in images:
item = self._oeb.manifest.hrefs[href]
data = item.data
# TODO: Re-size etc images
self._records.append(data)
def _generate_record0(self): def _generate_record0(self):
exth = self._build_exth() exth = self._build_exth()
@ -167,8 +244,6 @@ class MobiWriter(object):
0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 1, 0xffffffff)) 0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 1, 0xffffffff))
record0.write(exth) record0.write(exth)
npad = 4 - (record0.tell() % 4)
if npad < 4: record0.write('\0' * npad)
record0.write(title) record0.write(title)
record0 = record0.getvalue() record0 = record0.getvalue()
self._records[0] = record0 + ('\0' * (2452 - len(record0))) self._records[0] = record0 + ('\0' * (2452 - len(record0)))
@ -185,8 +260,17 @@ class MobiWriter(object):
exth.write(pack('>II', code, len(data) + 8)) exth.write(pack('>II', code, len(data) + 8))
exth.write(data) exth.write(data)
nrecs += 1 nrecs += 1
if oeb.metadata.cover:
id = str(oeb.metadata.cover[0])
href = oeb.manifest[id].href
index = self._images[href] + self._text_nrecords - 1
exth.write(pack('>III', 0xc9, 0x0c, index))
nrecs += 1
trail = exth.tell() % 4
pad = '' if not trail else '\0' * (4 - trail)
exth = exth.getvalue() exth = exth.getvalue()
return ''.join(['EXTH', pack('>II', len(exth) + 12, nrecs), exth]) exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
return ''.join(exth)
def _write_header(self): def _write_header(self):
title = str(self._oeb.metadata.title[0]) title = str(self._oeb.metadata.title[0])