Link serialization. Basic image embedding.

This commit is contained in:
Marshall T. Vandegrift 2008-12-29 02:56:00 -05:00
parent 61f2854c11
commit e61c667505
2 changed files with 137 additions and 29 deletions

View File

@ -48,6 +48,8 @@ OEB_CSS_MIME = 'text/x-oeb1-css'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
def element(parent, *args, **kwargs):
if parent is not None:
@ -153,7 +155,7 @@ class Metadata(object):
% (barename(self.term), self.value, self.attrib)
def __str__(self):
return str(self.value)
return self.value.encode('ascii', 'xmlcharrefreplace')
def __unicode__(self):
return unicode(self.value)
@ -688,12 +690,33 @@ class OEBBook(object):
if self._toc_from_html(opf): return
self._toc_from_spine(opf)
def _ensure_cover_image(self):
    """Add a ``cover`` metadata entry when none is declared.

    Candidates are tried in this order:

    1. the guide reference of type ``MS_COVER_TYPE``;
    2. the guide reference of type ``'cover'``;
    3. the first ``<img>`` found in the first spine document.

    A candidate is only used when its href is present in
    ``self.manifest.hrefs``; hrefs that are not in the manifest are
    ignored instead of raising ``KeyError``.  When the metadata already
    declares a cover, nothing is changed.
    """
    if self.metadata.cover:
        # A cover is already declared; there is nothing to add.
        return
    href = None
    if MS_COVER_TYPE in self.guide:
        href = self.guide[MS_COVER_TYPE].href
    elif 'cover' in self.guide:
        href = self.guide['cover'].href
    else:
        try:
            html = self.spine[0].data
        except IndexError:
            # Empty spine: no document to search for an image.
            html = None
        if html is not None:
            imgs = xpath(html, '//h:img[position()=1]')
            href = imgs[0].get('src') if imgs else None
    hrefs = self.manifest.hrefs
    if href and href in hrefs:
        self.metadata.add('cover', hrefs[href].id)
def _all_from_opf(self, opf):
self._metadata_from_opf(opf)
self._manifest_from_opf(opf)
self._spine_from_opf(opf)
self._guide_from_opf(opf)
self._toc_from_opf(opf)
self._ensure_cover_image()
def to_opf1(self):
package = etree.Element('package',
@ -757,6 +780,7 @@ class OEBBook(object):
NCX_MIME: (href, ncx)}
def main(argv=sys.argv):
for arg in argv[1:]:
oeb = OEBBook(arg)

View File

@ -15,10 +15,12 @@ import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
from lxml import etree
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.lit.oeb import XHTML, XHTML_NS, OEB_DOCS
from calibre.ebooks.lit.oeb import barename, namespace
from calibre.ebooks.lit.oeb import xpath, barename, namespace
from calibre.ebooks.lit.oeb import FauxLogger, OEBBook
MBP_NS = 'http://mobipocket.cam/ns/mbp'
@ -43,30 +45,104 @@ UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
# Matches one or more consecutive whitespace characters.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')


def encode(data):
    """Return *data* with whitespace runs collapsed, encoded as ASCII.

    Every run matched by ``COLLAPSE`` becomes a single space; characters
    that cannot be represented in ASCII are written as XML numeric
    character references.
    """
    collapsed = COLLAPSE.sub(' ', data)
    return collapsed.encode('ascii', 'xmlcharrefreplace')
class Serializer(object):
    """Serialize the spine documents of an OEB book into one markup string.

    The output is written into an in-memory buffer.  While writing, the
    buffer offset of every anchor (``id``/``name`` attribute, plus a
    synthetic ``<item id>_calibre_top`` anchor per spine item) is recorded
    in ``id_offsets``.  Internal ``href`` attributes are written as
    ``filepos=`` followed by a ten-digit zero placeholder whose buffer
    offset is recorded in ``href_offsets``; ``fixup_links()`` then
    overwrites the placeholders with the offsets of their targets.

    Parameters:
        oeb: the book; ``oeb.spine`` is iterated and
            ``oeb.manifest.hrefs`` is used to resolve hrefs.
        images: mapping from image href to record index, used for the
            ``recindex`` attribute written in place of ``src``.

    The finished output is available as ``self.raw`` and via ``str()``.
    """

    def __init__(self, oeb, images):
        self.oeb = oeb
        self.images = images
        # anchor key -> buffer offset at which the anchored element starts
        self.id_offsets = {}
        # anchor key -> offsets of the filepos placeholders that point at it
        self.href_offsets = defaultdict(list)
        buffer = self.buffer = StringIO()
        buffer.write('<html>')
        self.serialize_head()
        self.serialize_body()
        buffer.write('</html>')
        # Placeholders can only be resolved once every anchor has been seen.
        self.fixup_links()
        self.raw = buffer.getvalue()

    def __str__(self):
        """Return the serialized markup."""
        return self.raw

    @staticmethod
    def _escape(text):
        """Escape ``&``, ``<`` and ``>`` so *text* is safe as markup content."""
        # '&' must be replaced first so the entities added below survive.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        return text.replace('>', '&gt;')

    def _write_text(self, text):
        """Write character data: escaped, whitespace-collapsed, ASCII-encoded."""
        self.buffer.write(encode(self._escape(text)))

    def serialize_head(self):
        """Write an empty ``<head>`` element."""
        buffer = self.buffer
        buffer.write('<head>')
        buffer.write('</head>')

    def serialize_body(self):
        """Write ``<body>`` containing every spine item that is an OEB document."""
        buffer = self.buffer
        buffer.write('<body>')
        for item in self.oeb.spine:
            if item.media_type not in OEB_DOCS:
                continue
            self.serialize_item(item)
        buffer.write('</body>')

    def serialize_item(self, item):
        """Write a page break, then the children of *item*'s ``<body>``.

        The offset just after the page break is recorded under the key
        ``<item.id>_calibre_top`` so that links to the document itself
        (no fragment) have a target.
        """
        buffer = self.buffer
        buffer.write('<mbp:pagebreak/>')
        # TODO: Figure out how to make the 'crossable' stuff work for
        # non-"linear" spine items.
        self.id_offsets[item.id + '_calibre_top'] = buffer.tell()
        body = item.data.find(XHTML('body'))
        if body is None:
            # Nothing to serialize; the top anchor above still resolves.
            return
        if body.text and body.text.strip():
            # Text placed directly inside <body> before its first child.
            self._write_text(body.text)
        for elem in body:
            self.serialize_elem(elem, item)

    def serialize_elem(self, elem, item):
        """Recursively write *elem*, its children and its tail text.

        Only elements in the XHTML and MBP namespaces are written; any
        other node (including comments and processing instructions) is
        skipped, but the text following it is still emitted.  ``id`` and
        ``name`` attributes are not written: their position is recorded in
        ``id_offsets`` instead.  The source tree is left unmodified.
        """
        buffer = self.buffer
        raw_tag = elem.tag
        # lxml comments/processing instructions carry a callable as .tag.
        if callable(raw_tag):
            ns = None
        else:
            ns = namespace(raw_tag)
        if ns not in (XHTML_NS, MBP_NS):
            if elem.tail:
                self._write_text(elem.tail)
            return
        hrefs = self.oeb.manifest.hrefs
        tag = barename(raw_tag)
        if ns == MBP_NS:
            tag = 'mbp:' + tag
        anchor_attrs = ('name', 'id')
        for attr in anchor_attrs:
            if attr in elem.attrib:
                anchor = '_'.join((item.id, elem.attrib[attr]))
                self.id_offsets[anchor] = buffer.tell()
        buffer.write('<')
        buffer.write(tag)
        for attr, val in elem.attrib.items():
            if attr in anchor_attrs:
                # Recorded as an offset above; never emitted.
                continue
            buffer.write(' ')
            if attr == 'href':
                path, frag = urldefrag(val)
                # TODO: Absolute path translation
                if not path or path in hrefs:
                    target = hrefs[path].id if path else item.id
                    key = '_'.join((target, frag if frag else 'calibre_top'))
                    buffer.write('filepos=')
                    # Remember where the placeholder starts for fixup_links().
                    self.href_offsets[key].append(buffer.tell())
                    buffer.write('0000000000')
                    continue
            elif attr == 'src' and val in hrefs and val in self.images:
                buffer.write('recindex="%05d"' % self.images[val])
                continue
            # Escape and ASCII-encode the value so that quotes, markup
            # characters and non-ASCII text cannot corrupt the output.
            safe = self._escape(val).replace('"', '&quot;')
            safe = safe.encode('ascii', 'xmlcharrefreplace')
            buffer.write('%s="%s"' % (attr, safe))
        if elem.text or len(elem) > 0:
            buffer.write('>')
            if elem.text:
                self._write_text(elem.text)
            for child in elem:
                self.serialize_elem(child, item)
            buffer.write('</%s>' % tag)
        else:
            buffer.write('/>')
        # The tail belongs to the parent's content and must be written
        # for self-closed elements as well.
        if elem.tail:
            self._write_text(elem.tail)

    def fixup_links(self):
        """Overwrite each ``filepos`` placeholder with its target's offset.

        Links whose target anchor was never recorded keep the all-zero
        placeholder instead of raising ``KeyError``.
        """
        buffer = self.buffer
        for key, hoffs in self.href_offsets.items():
            ioff = self.id_offsets.get(key)
            if ioff is None:
                continue
            for hoff in hoffs:
                buffer.seek(hoff)
                buffer.write('%010d' % ioff)
        # Leave the position at the end so any later write appends.
        buffer.seek(0, 2)
def preserve(function):
def wrapper(self, *args, **kwargs):
@ -79,8 +155,8 @@ def preserve(function):
return wrapper
class MobiWriter(object):
def __init__(self, compress=None, logger=FauxLogger()):
    """Create a writer.

    Parameters:
        compress: compression type constant; any falsy value (including
            the default ``None``) selects ``UNCOMPRESSED``.
        logger: object used for logging.  Note that the default
            ``FauxLogger()`` instance is created once, when the method is
            defined, and is therefore shared by all writers that rely on
            the default.
    """
    self._compress = compress or UNCOMPRESSED
    self._logger = logger
def dump(self, oeb, path):
@ -93,11 +169,6 @@ class MobiWriter(object):
for datum in data:
self._stream.write(datum)
@preserve
def _writeat(self, pos, *data):
    """Seek the output stream to *pos* and write *data* there.

    The actual writing is delegated to ``self._write``.
    NOTE(review): the ``preserve`` decorator presumably restores the
    stream position afterwards -- confirm against its definition.
    """
    stream = self._stream
    stream.seek(pos)
    self._write(*data)
def _tell(self):
return self._stream.tell()
@ -116,7 +187,7 @@ class MobiWriter(object):
self._generate_images()
def _map_image_names(self):
index = 0
index = 1
self._images = images = {}
for item in self._oeb.manifest.values():
if item.media_type.startswith('image/'):
@ -140,7 +211,13 @@ class MobiWriter(object):
self._text_nrecords = nrecords
def _generate_images(self):
pass
images = [(index, href) for href, index in self._images.items()]
images.sort()
for _, href in images:
item = self._oeb.manifest.hrefs[href]
data = item.data
# TODO: Re-size etc images
self._records.append(data)
def _generate_record0(self):
exth = self._build_exth()
@ -167,8 +244,6 @@ class MobiWriter(object):
0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 1, 0xffffffff))
record0.write(exth)
npad = 4 - (record0.tell() % 4)
if npad < 4: record0.write('\0' * npad)
record0.write(title)
record0 = record0.getvalue()
self._records[0] = record0 + ('\0' * (2452 - len(record0)))
@ -185,8 +260,17 @@ class MobiWriter(object):
exth.write(pack('>II', code, len(data) + 8))
exth.write(data)
nrecs += 1
if oeb.metadata.cover:
id = str(oeb.metadata.cover[0])
href = oeb.manifest[id].href
index = self._images[href] + self._text_nrecords - 1
exth.write(pack('>III', 0xc9, 0x0c, index))
nrecs += 1
trail = exth.tell() % 4
pad = '' if not trail else '\0' * (4 - trail)
exth = exth.getvalue()
return ''.join(['EXTH', pack('>II', len(exth) + 12, nrecs), exth])
exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
return ''.join(exth)
def _write_header(self):
title = str(self._oeb.metadata.title[0])