Merge current state of oeb2mobi.

This commit is contained in:
Marshall T. Vandegrift 2008-12-30 19:46:23 -05:00
commit a8060652c8
5 changed files with 637 additions and 13 deletions

View File

@ -25,7 +25,8 @@ from calibre.ebooks.lit.reader import DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, urlnormalize, xpath
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
from calibre.ebooks.oeb.base import FauxLogger, OEBBook
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.lit.lzx import Compressor
@ -116,12 +117,6 @@ LZXC_CONTROL = \
# Matches one or more consecutive whitespace characters (space, tab, CR, LF
# or vertical tab).  Presumably used to collapse such runs; the call sites are
# not visible here -- TODO confirm.
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
def prefixname(name, nsrmap):
    """Return `name` in ``prefix:local`` form using the reverse map `nsrmap`.

    `nsrmap` maps namespace URIs to prefixes and is indexed with
    ``namespace(name)``; a missing namespace raises ``KeyError``.  When the
    mapped prefix is empty or None only the bare local name is returned.
    """
    ns_prefix = nsrmap[namespace(name)]
    local = barename(name)
    if ns_prefix:
        return ':'.join((ns_prefix, local))
    return local
def decint(value):
bytes = []
while True:

View File

@ -3,6 +3,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from struct import pack
main_language = {
0 : "NEUTRAL",
54 : "AFRIKAANS",
@ -155,5 +157,168 @@ sub_language = {
2 : "SWEDISH_FINLAND",
1 : "UZBEK_LATIN",
2 : "UZBEK_CYRILLIC",
}
}
# Mapping from IANA language tags to Mobipocket language codes.
# Outer keys are lowercase primary language subtags (None is the fallback
# entry); inner keys are uppercase region subtags, with None giving the code
# for the bare language.  Each value is a pair of integers: the first element
# lines up with the keys of main_language (e.g. 54 for 'af'); the second is
# presumably the matching sub_language key -- TODO confirm.
IANA_MOBI = \
{None: {None: (0, 0)},
'af': {None: (54, 0)},
'ar': {None: (1, 0),
'AE': (1, 56),
'BH': (1, 60),
'DZ': (1, 20),
'EG': (1, 12),
'JO': (1, 44),
'KW': (1, 52),
'LB': (1, 48),
'MA': (1, 24),
'OM': (1, 32),
'QA': (1, 64),
'SA': (1, 4),
'SY': (1, 40),
'TN': (1, 28),
'YE': (1, 36)},
'as': {None: (77, 0)},
'az': {None: (44, 0)},
'be': {None: (35, 0)},
'bg': {None: (2, 0)},
'bn': {None: (69, 0)},
'ca': {None: (3, 0)},
'cs': {None: (5, 0)},
'da': {None: (6, 0)},
'de': {None: (7, 0),
'AT': (7, 12),
'CH': (7, 8),
'LI': (7, 20),
'LU': (7, 16)},
'el': {None: (8, 0)},
'en': {None: (9, 0),
'AU': (9, 12),
'BZ': (9, 40),
'CA': (9, 16),
'GB': (9, 8),
'IE': (9, 24),
'JM': (9, 32),
'NZ': (9, 20),
'PH': (9, 52),
'TT': (9, 44),
'US': (9, 4),
'ZA': (9, 28),
'ZW': (9, 48)},
'es': {None: (10, 0),
'AR': (10, 44),
'BO': (10, 64),
'CL': (10, 52),
'CO': (10, 36),
'CR': (10, 20),
'DO': (10, 28),
'EC': (10, 48),
'ES': (10, 4),
'GT': (10, 16),
'HN': (10, 72),
'MX': (10, 8),
'NI': (10, 76),
'PA': (10, 24),
'PE': (10, 40),
'PR': (10, 80),
'PY': (10, 60),
'SV': (10, 68),
'UY': (10, 56),
'VE': (10, 32)},
'et': {None: (37, 0)},
'eu': {None: (45, 0)},
'fa': {None: (41, 0)},
'fi': {None: (11, 0)},
'fo': {None: (56, 0)},
'fr': {None: (12, 0),
'BE': (12, 8),
'CA': (12, 12),
'CH': (12, 16),
'FR': (12, 4),
'LU': (12, 20),
'MC': (12, 24)},
'gu': {None: (71, 0)},
'he': {None: (13, 0)},
'hi': {None: (57, 0)},
'hr': {None: (26, 0)},
'hu': {None: (14, 0)},
'hy': {None: (43, 0)},
'id': {None: (33, 0)},
'is': {None: (15, 0)},
'it': {None: (16, 0),
'CH': (16, 8),
'IT': (16, 4)},
'ja': {None: (17, 0)},
'ka': {None: (55, 0)},
'kk': {None: (63, 0)},
'kn': {None: (75, 0)},
'ko': {None: (18, 0)},
'kok': {None: (87, 0)},
'lt': {None: (39, 0)},
'lv': {None: (38, 0)},
'mk': {None: (47, 0)},
'ml': {None: (76, 0)},
'mr': {None: (78, 0)},
'ms': {None: (62, 0)},
'mt': {None: (58, 0)},
'ne': {None: (97, 0)},
'nl': {None: (19, 0),
'BE': (19, 8)},
'no': {None: (20, 0)},
'or': {None: (72, 0)},
'pa': {None: (70, 0)},
'pl': {None: (21, 0)},
'pt': {None: (22, 0),
'BR': (22, 4),
'PT': (22, 8)},
'rm': {None: (23, 0)},
'ro': {None: (24, 0)},
'ru': {None: (25, 0)},
'sa': {None: (79, 0)},
'se': {None: (59, 0)},
'sk': {None: (27, 0)},
'sl': {None: (36, 0)},
'sq': {None: (28, 0)},
'sr': {None: (26, 12),
'RS': (26, 12)},
'st': {None: (48, 0)},
'sv': {None: (29, 0),
'FI': (29, 8)},
'sw': {None: (65, 0)},
'ta': {None: (73, 0)},
'te': {None: (74, 0)},
'th': {None: (30, 0)},
'tn': {None: (50, 0)},
'tr': {None: (31, 0)},
'ts': {None: (49, 0)},
'tt': {None: (68, 0)},
'uk': {None: (34, 0)},
'ur': {None: (32, 0)},
'uz': {None: (67, 0),
'UZ': (67, 8)},
'vi': {None: (42, 0)},
'wen': {None: (46, 0)},
'xh': {None: (52, 0)},
'zh': {None: (4, 0),
'CN': (4, 8),
'HK': (4, 12),
'SG': (4, 16),
'TW': (4, 4)},
'zu': {None: (53, 0)}}
def iana2mobi(icode):
    """Convert an IANA language tag into a packed Mobipocket language code.

    `icode` is a hyphen-separated tag such as ``'en'`` or ``'pt-BR'``.
    Subtags are consumed left to right until one (lowercased) is a key of
    IANA_MOBI; the remaining subtags are then tried, as given and
    uppercased, as region keys of that language's entry.  An unknown
    language falls back to ``IANA_MOBI[None]`` and an unknown region to the
    language's ``None`` entry.

    Returns the 4-byte string ``pack('>HBB', 0, sub, main)`` where
    ``(main, sub)`` is the matched IANA_MOBI pair.
    """
    subtags = icode.split('-')
    langdict = IANA_MOBI[None]
    # Locate the primary language, discarding anything that precedes it.
    while subtags:
        lang = subtags.pop(0).lower()
        if lang in IANA_MOBI:
            langdict = IANA_MOBI[lang]
            break
    mcode = langdict[None]
    # Refine with the first remaining subtag that names a known region.
    while subtags:
        subtag = subtags.pop(0)
        if subtag not in langdict:
            subtag = subtag.upper()
        if subtag in langdict:
            mcode = langdict[subtag]
            break
    return pack('>HBB', 0, mcode[1], mcode[0])

View File

@ -2,7 +2,11 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
from cStringIO import StringIO
from struct import pack
# NOTE(review): presumably the number of low bits of a two-byte back-reference
# that hold the match length (compress_doc shifts the distance left by 3 and
# stores length - 3 in the remainder) -- confirm against decompress_doc.
COUNT_BITS = 3
@ -31,4 +35,53 @@ def decompress_doc(data):
res.append(res[j - di+k])
return ''.join([chr(i) for i in res])
def compress_doc(data):
    """Compress the string `data` with the PalmDoc scheme and return it.

    The output is built from four kinds of token:

    * a two-byte back-reference (``0x8000 | distance << 3 | length - 3``)
      for a 3-10 character chunk that already occurs at most 2047
      characters earlier;
    * a single byte ``char ^ 0x80`` for a space followed by a character in
      the range 0x40-0x7f;
    * the character itself for NUL and for 0x09-0x7f;
    * a count byte (1-8) followed by that many raw characters for anything
      else (0x01-0x08 and 0x80-0xff).
    """
    out = StringIO()
    i = 0
    ldata = len(data)
    while i < ldata:
        if i > 10 and (ldata - i) > 10:
            chunk = ''
            match = -1
            # Try the longest chunk first and shrink until a usable match
            # (close enough to be encoded in 11 bits) is found.
            for j in range(10, 2, -1):
                chunk = data[i:i+j]
                try:
                    match = data.rindex(chunk, 0, i)
                except ValueError:
                    continue
                if (i - match) <= 2047:
                    break
                match = -1
            if match >= 0:
                n = len(chunk)
                m = i - match
                code = 0x8000 + ((m << 3) & 0x3ff8) + (n - 3)
                # Big-endian 16-bit value, written one byte at a time.
                out.write(chr(code >> 8) + chr(code & 0xff))
                i += n
                continue
        ch = data[i]
        och = ord(ch)
        i += 1
        if ch == ' ' and (i + 1) < ldata:
            onch = ord(data[i])
            if onch >= 0x40 and onch < 0x80:
                out.write(chr(onch ^ 0x80))
                i += 1
                continue
        if och == 0 or (och >= 9 and och < 0x80):
            out.write(ch)
        else:
            # Gather the run of characters that must be escaped.  The count
            # byte can only express 1-8, and the scan must stop at the end
            # of the input.
            j = i
            binseq = [ch]
            while j < ldata and len(binseq) < 8:
                ch = data[j]
                och = ord(ch)
                if och == 0 or (och > 8 and och < 0x80):
                    break
                binseq.append(ch)
                j += 1
            out.write(chr(len(binseq)))
            out.write(''.join(binseq))
            i += len(binseq) - 1
    return out.getvalue()

View File

@ -0,0 +1,356 @@
'''
Write content to Mobipocket books.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
# Copyright holder of this module.
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys
import os
from struct import pack
import functools
import time
import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
from lxml import etree
from PIL import Image
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.lit.oeb import XML_NS, XHTML, XHTML_NS, OEB_DOCS
from calibre.ebooks.lit.oeb import xpath, barename, namespace, prefixname
from calibre.ebooks.lit.oeb import FauxLogger, OEBBook
# Namespace URI of Mobipocket-specific elements.
MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    """Return `name` qualified with MBP_NS as ``{namespace}name``."""
    qualified = '{%s}%s' % (MBP_NS, name)
    return qualified
# EXTH record code written for each supported metadata term.
EXTH_CODES = dict([
    ('creator', 100),
    ('publisher', 101),
    ('description', 103),
    ('identifier', 104),
    ('subject', 105),
    ('date', 106),
    ('review', 107),
    ('contributor', 108),
    ('rights', 109),
    ('type', 111),
    ('source', 112),
    ('title', 503),
])

# Compression identifiers; the chosen one is written at the start of record 0.
UNCOMPRESSED, PALMDOC, HUFFDIC = 1, 2, 17480
def encode(data):
    """Encode `data` as ASCII, turning other characters into ``&#N;`` references."""
    charset, on_error = 'ascii', 'xmlcharrefreplace'
    return data.encode(charset, on_error)
class Serializer(object):
    """Serialize an OEB book into a single Mobipocket-flavoured HTML string.

    The head (with the guide, if any) and the body of every spine item are
    written into one buffer.  Internal links are emitted as fixed-width
    ``filepos=`` placeholders; `fixup_links` later overwrites each one with
    the buffer offset recorded for its target.  The finished markup is
    available as ``self.raw`` or through ``str()``.
    """

    NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'}
    # Attributes that mark link targets: their position is recorded but the
    # attribute itself is not written to the output.
    ANCHOR_ATTRS = ('name', 'id')

    def __init__(self, oeb, images):
        """Serialize `oeb` immediately.

        `images` maps manifest hrefs of images to their record index.
        """
        self.oeb = oeb
        self.images = images
        # '<item id>#<fragment>' -> buffer offset of the target.
        self.id_offsets = {}
        # '<item id>#<fragment>' -> offsets of placeholders pointing at it.
        self.href_offsets = defaultdict(list)
        buffer = self.buffer = StringIO()
        buffer.write('<html>')
        self.serialize_head()
        self.serialize_body()
        buffer.write('</html>')
        self.fixup_links()
        self.raw = buffer.getvalue()

    def __str__(self):
        return self.raw

    def serialize_head(self):
        """Write the <head> element, including the guide when it is non-empty."""
        buffer = self.buffer
        buffer.write('<head>')
        if len(self.oeb.guide) > 0:
            self.serialize_guide()
        buffer.write('</head>')

    def serialize_guide(self):
        """Write one <reference> per guide entry inside a <guide> element."""
        buffer = self.buffer
        buffer.write('<guide>')
        for ref in self.oeb.guide.values():
            buffer.write('<reference title="%s" type="%s" '
                         % (ref.title, ref.type))
            self.serialize_href(ref.href)
            buffer.write('/>')
        buffer.write('</guide>')

    def serialize_href(self, href, base=None):
        """Write a ``filepos=`` placeholder for `href` if it is internal.

        `base` is the manifest item the link occurs in; relative paths are
        resolved against it.  Returns False, writing nothing, when the target
        is not in the manifest or is not part of the spine; otherwise records
        the placeholder position for `fixup_links` and returns True.
        """
        hrefs = self.oeb.manifest.hrefs
        path, frag = urldefrag(href)
        if path and base:
            path = base.abshref(path)
        if path and path not in hrefs:
            return False
        buffer = self.buffer
        item = hrefs[path] if path else None
        if item and item.spine_position is None:
            return False
        id = item.id if item else base.id
        frag = frag if frag else 'calibre_top'
        href = '#'.join((id, frag))
        buffer.write('filepos=')
        self.href_offsets[href].append(buffer.tell())
        buffer.write('0000000000')
        return True

    def serialize_body(self):
        """Write the <body> element containing every spine item in order."""
        buffer = self.buffer
        buffer.write('<body>')
        for item in self.oeb.spine:
            self.serialize_item(item)
        buffer.write('</body>')

    def serialize_item(self, item):
        """Write a page break followed by the children of `item`'s <body>."""
        buffer = self.buffer
        buffer.write('<mbp:pagebreak/>')
        # TODO: Figure out how to make the 'crossable' stuff work for
        # non-"linear" spine items.
        self.id_offsets[item.id + '#calibre_top'] = buffer.tell()
        for elem in item.data.find(XHTML('body')):
            self.serialize_elem(elem, item)

    def _write_text(self, text, quot=False):
        """Write `text` with markup characters escaped and non-ASCII encoded.

        ``&``, ``<`` and ``>`` are always replaced by entities; with `quot`
        true, double quotes are too (for attribute values).
        """
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        if quot:
            text = text.replace('"', '&quot;')
        self.buffer.write(encode(text))

    def serialize_elem(self, elem, item, nsrmap=NSRMAP):
        """Write `elem`, its attributes, text, children and tail.

        Elements and attributes whose namespace is not a key of `nsrmap` are
        dropped.  ``name``/``id`` attributes are recorded as link targets
        instead of being written, internal ``href`` values become
        ``filepos=`` placeholders and ``src`` values naming a known image
        become ``recindex`` attributes.  The source tree is left unmodified.
        """
        if namespace(elem.tag) not in nsrmap:
            return
        buffer = self.buffer
        tag = prefixname(elem.tag, nsrmap)
        for attr in self.ANCHOR_ATTRS:
            if attr in elem.attrib:
                id = '#'.join((item.id, elem.attrib[attr]))
                self.id_offsets[id] = buffer.tell()
        buffer.write('<')
        buffer.write(tag)
        for attr, val in elem.attrib.items():
            if attr in self.ANCHOR_ATTRS:
                continue
            if namespace(attr) not in nsrmap:
                continue
            attr = prefixname(attr, nsrmap)
            buffer.write(' ')
            if attr == 'href':
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                # Image records are keyed by manifest href, so resolve the
                # document-relative reference the same way links are.
                href = item.abshref(val)
                if href in self.images:
                    buffer.write('recindex="%05d"' % self.images[href])
                    continue
            buffer.write('%s="' % attr)
            self._write_text(val, quot=True)
            buffer.write('"')
        if elem.text or len(elem) > 0:
            buffer.write('>')
            if elem.text:
                self._write_text(elem.text)
            for child in elem:
                self.serialize_elem(child, item)
            buffer.write('</%s>' % tag)
        else:
            buffer.write('/>')
        if elem.tail:
            self._write_text(elem.tail)

    def fixup_links(self):
        """Overwrite every ``filepos=`` placeholder with its target offset.

        Links whose target was never recorded (for example a fragment that
        does not exist in the document) keep the all-zero placeholder.
        """
        buffer = self.buffer
        for id, hoffs in self.href_offsets.items():
            if id not in self.id_offsets:
                continue
            ioff = self.id_offsets[id]
            for hoff in hoffs:
                buffer.seek(hoff)
                buffer.write('%010d' % ioff)
class MobiWriter(object):
    """Write an OEB book to a Mobipocket file.

    The output is assembled as a list of records: record 0 (headers), the
    text records, then the image records.  `dump` generates them all and
    writes the file header followed by the records.
    """

    def __init__(self, compress=None, logger=FauxLogger()):
        """`compress` is one of the module's compression constants; a false
        value selects UNCOMPRESSED."""
        self._compress = compress or UNCOMPRESSED
        self._logger = logger

    def dump(self, oeb, path):
        """Write `oeb` to `path`, which is a file name or an object with a
        ``write`` method."""
        if hasattr(path, 'write'):
            return self._dump_stream(oeb, path)
        with open(path, 'w+b') as stream:
            return self._dump_stream(oeb, stream)

    def _write(self, *data):
        """Write each string in `data` to the output stream."""
        for datum in data:
            self._stream.write(datum)

    def _tell(self):
        return self._stream.tell()

    def _dump_stream(self, oeb, stream):
        """Generate all records for `oeb` and write them to `stream`."""
        self._oeb = oeb
        self._stream = stream
        # Slot 0 is reserved for record 0, which is built after the content.
        self._records = [None]
        self._generate_content()
        self._generate_record0()
        self._write_header()
        self._write_content()

    def _generate_content(self):
        self._map_image_names()
        self._generate_text()
        self._generate_images()

    def _map_image_names(self):
        """Assign a 1-based index to every manifest item with an image type."""
        index = 1
        self._images = images = {}
        for item in self._oeb.manifest.values():
            if item.media_type.startswith('image/'):
                images[item.href] = index
                index += 1

    def _generate_text(self):
        """Serialize the book and split it into records of 0x1000 bytes.

        Each chunk is compressed when PALMDOC compression is selected.
        """
        serializer = Serializer(self._oeb, self._images)
        text = str(serializer)
        self._text_length = len(text)
        text = StringIO(text)
        nrecords = 0
        data = text.read(0x1000)
        while len(data) > 0:
            nrecords += 1
            if self._compress == PALMDOC:
                data = compress_doc(data)
            # Without the NUL Mobipocket Desktop 6.2 will thrash. Why?
            self._records.append(data + '\0')
            data = text.read(0x1000)
        self._text_nrecords = nrecords

    def _rescale_image(self, data, maxsizeb, dimen=None):
        """Return image `data` reduced to at most `maxsizeb` bytes if possible.

        With `dimen` given the image is first shrunk to fit those dimensions.
        Data that is still too large is re-encoded as JPEG at decreasing
        quality until it fits; if it never does, the quality-0 encoding is
        returned.
        """
        if dimen is not None:
            image = Image.open(StringIO(data))
            image.thumbnail(dimen, Image.ANTIALIAS)
            data = StringIO()
            image.save(data, image.format)
            data = data.getvalue()
        if len(data) < maxsizeb:
            return data
        image = Image.open(StringIO(data))
        if image.mode not in ('RGB', 'L'):
            # The JPEG encoder cannot write palette or alpha modes.
            image = image.convert('RGB')
        for quality in xrange(95, -1, -1):
            data = StringIO()
            image.save(data, 'JPEG', quality=quality)
            data = data.getvalue()
            if len(data) <= maxsizeb:
                break
        return data

    def _generate_images(self):
        """Append one record per image, in index order.

        The cover image is allowed 89 KiB, every other image 63 KiB.
        """
        images = [(index, href) for href, index in self._images.items()]
        images.sort()
        metadata = self._oeb.metadata
        coverid = metadata.cover[0] if metadata.cover else None
        for _, href in images:
            item = self._oeb.manifest.hrefs[href]
            maxsizek = 89 if coverid == item.id else 63
            maxsizeb = maxsizek * 1024
            data = self._rescale_image(item.data, maxsizeb)
            self._records.append(data)

    def _generate_record0(self):
        """Build record 0: PalmDoc header, MOBI header, EXTH block and title.

        The result is NUL-padded to 2452 bytes when shorter.
        """
        metadata = self._oeb.metadata
        exth = self._build_exth()
        record0 = StringIO()
        record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length,
                           self._text_nrecords, 0x1000, 0, 0))
        uid = random.randint(0, 0xffffffff)
        title = str(metadata.title[0])
        record0.write('MOBI')
        record0.write(pack('>IIIII', 0xe8, 2, 65001, uid, 5))
        record0.write('\xff' * 40)
        record0.write(pack('>I', self._text_nrecords + 1))
        record0.write(pack('>II', 0xe8 + 16 + len(exth), len(title)))
        record0.write(iana2mobi(str(metadata.language[0])))
        record0.write('\0' * 8)
        record0.write(pack('>II', 5, self._text_nrecords + 1))
        record0.write('\0' * 16)
        record0.write(pack('>I', 0x50))
        record0.write('\0' * 32)
        record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
        # TODO: What the hell are these fields?
        record0.write(pack('>IIIIIIIIIIIIIIIII',
            0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
            0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 1, 0xffffffff))
        record0.write(exth)
        record0.write(title)
        record0 = record0.getvalue()
        self._records[0] = record0 + ('\0' * (2452 - len(record0)))

    def _build_exth(self):
        """Return the EXTH block for the book's metadata.

        One entry is written per value of each term listed in EXTH_CODES.
        When a cover is set, three further entries (codes 0xc9, 0xcb and
        0xca) carry the cover index, a zero and the index of a freshly
        generated thumbnail.  The block is NUL-padded to a multiple of four
        bytes.
        """
        oeb = self._oeb
        exth = StringIO()
        nrecs = 0
        for term in oeb.metadata:
            if term not in EXTH_CODES: continue
            code = EXTH_CODES[term]
            for item in oeb.metadata[term]:
                data = str(item)
                exth.write(pack('>II', code, len(data) + 8))
                exth.write(data)
                nrecs += 1
        if oeb.metadata.cover:
            id = str(oeb.metadata.cover[0])
            item = oeb.manifest[id]
            href = item.href
            # Image indices are 1-based; the EXTH entries store them 0-based.
            index = self._images[href] - 1
            exth.write(pack('>III', 0xc9, 0x0c, index))
            exth.write(pack('>III', 0xcb, 0x0c, 0))
            index = self._add_thumbnail(item) - 1
            exth.write(pack('>III', 0xca, 0x0c, index))
            nrecs += 3
        exth = exth.getvalue()
        trail = len(exth) % 4
        pad = '' if not trail else '\0' * (4 - trail)
        exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad]
        return ''.join(exth)

    def _add_thumbnail(self, item):
        """Add a thumbnail of image `item` and return its 1-based index.

        The thumbnail fits within 180x240, is limited to 16 KiB, and is
        added to the manifest, the image map and the record list.
        """
        maxsizeb = 16 * 1024
        dimen = (180, 240)
        data = self._rescale_image(item.data, maxsizeb, dimen)
        manifest = self._oeb.manifest
        id, href = manifest.generate('thumbnail', 'thumbnail.jpeg')
        manifest.add(id, href, 'image/jpeg', data=data)
        index = len(self._images) + 1
        self._images[href] = index
        self._records.append(data)
        return index

    def _write_header(self):
        """Write the database header and the record offset table."""
        title = str(self._oeb.metadata.title[0])
        # The name field is 32 bytes including its terminating NUL, so at
        # most 31 characters of the sanitized title are kept.
        title = re.sub('[^-A-Za-z0-9]+', '_', title)[:31]
        title = title + ('\0' * (32 - len(title)))
        now = int(time.time())
        nrecords = len(self._records)
        self._write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0),
            'BOOK', 'MOBI', pack('>IIH', nrecords, 0, nrecords))
        # The first record starts after the 8-byte table entries and the
        # two trailing NUL bytes.
        offset = self._tell() + (8 * nrecords) + 2
        for id, record in izip(count(), self._records):
            self._write(pack('>I', offset), '\0', pack('>I', id)[1:])
            offset += len(record)
        self._write('\0\0')

    def _write_content(self):
        """Write every record, in order, after the header."""
        for record in self._records:
            self._write(record)
def main(argv=sys.argv):
    """Convert the OEB book at ``argv[1]`` into a Mobipocket file at ``argv[2]``.

    Exactly two arguments must follow the program name.  Returns 0.
    """
    args = argv[1:]
    inpath, outpath = args
    book = OEBBook(inpath)
    MobiWriter().dump(book, outpath)
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -50,6 +50,8 @@ OPENTYPE_MIME = 'font/opentype'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
def element(parent, *args, **kwargs):
if parent is not None:
@ -66,6 +68,12 @@ def barename(name):
return name.split('}', 1)[1]
return name
def prefixname(name, nsrmap):
    """Render the qualified `name` as ``prefix:local`` via `nsrmap`.

    `nsrmap` is indexed with ``namespace(name)`` (``KeyError`` if absent).
    A false prefix (empty string or None) yields just the bare name.
    """
    mapped = nsrmap[namespace(name)]
    bare = barename(name)
    return ':'.join((mapped, bare)) if mapped else bare
def xpath(elem, expr):
    """Evaluate the XPath `expr` on `elem` with the XPNSMAP prefix bindings."""
    matches = elem.xpath(expr, namespaces=XPNSMAP)
    return matches
@ -147,6 +155,7 @@ class Metadata(object):
TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
'format', 'identifier', 'language', 'publisher', 'relation',
'rights', 'source', 'subject', 'title', 'type'])
ATTRS = set(['role', 'file-as', 'scheme'])
OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
'xsi': XSI_NS}
@ -163,7 +172,12 @@ class Metadata(object):
self.value = value
self.attrib = attrib = {}
for fq_attr in fq_attrib:
attr = barename(fq_attr)
if fq_attr in Metadata.ATTRS:
attr = fq_attr
fq_attr = OPF2(fq_attr)
fq_attrib[fq_attr] = fq_attrib.pop(attr)
else:
attr = barename(fq_attr)
attrib[attr] = fq_attrib[fq_attr]
def __getattr__(self, name):
@ -180,7 +194,7 @@ class Metadata(object):
% (barename(self.term), self.value, self.attrib)
def __str__(self):
    """Return the value as an ASCII byte string.

    Characters outside ASCII are written as XML character references, so
    the conversion cannot fail on non-ASCII values.
    """
    return unicode(self.value).encode('ascii', 'xmlcharrefreplace')
def __unicode__(self):
    """Return the value converted with ``unicode()``."""
    value = self.value
    return unicode(value)
@ -317,6 +331,14 @@ class Manifest(object):
if frag:
relhref = '#'.join((relhref, frag))
return relhref
def abshref(self, href):
    """Resolve `href`, relative to this item, against the item's directory.

    When this item's own href has no directory part, `href` is returned
    untouched.  Otherwise it is joined to the directory of ``self.href``,
    normalized, and returned with forward slashes.
    """
    base = self.href
    if '/' in base:
        joined = os.path.join(os.path.dirname(base), href)
        return os.path.normpath(joined).replace('\\', '/')
    return href
def __init__(self, oeb):
self.oeb = oeb
@ -503,6 +525,9 @@ class Guide(object):
def __contains__(self, key):
    """Return whether `key` is present in ``self.refs``."""
    refs = self.refs
    return key in refs
def __len__(self):
    """Return the number of entries in ``self.refs``."""
    refs = self.refs
    return len(refs)
def to_opf1(self, parent=None):
elem = element(parent, 'guide')
for ref in self.refs.values():
@ -652,6 +677,15 @@ class OEBBook(object):
else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
self.uid = metadata.identifier[0]
if not metadata.language:
self.logger.log_warn(u'Language not specified.')
metadata.add('language', 'en')
if not metadata.creator:
self.logger.log_warn(u'Creator not specified.')
metadata.add('creator', 'Unknown')
if not metadata.title:
self.logger.log_warn(u'Title not specified.')
metadata.add('title', 'Unknown')
def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self)
@ -789,6 +823,25 @@ class OEBBook(object):
if self._toc_from_tour(opf): return
if self._toc_from_html(opf): return
self._toc_from_spine(opf)
def _ensure_cover_image(self):
    """Add a 'cover' metadata entry when none exists and a cover is found.

    Candidates are tried in order: the guide reference of type
    MS_COVER_TYPE, the guide reference of type 'cover', then the first
    image of the first spine document.  A candidate whose href is not in
    the manifest is skipped rather than raising.  Nothing is done when the
    metadata already names a cover.
    """
    if self.metadata.cover:
        return
    hrefs = self.manifest.hrefs
    cover = None
    for gtype in (MS_COVER_TYPE, 'cover'):
        if gtype not in self.guide:
            continue
        href = self.guide[gtype].href
        if href in hrefs:
            cover = hrefs[href]
            break
    if cover is None:
        first = self.spine[0]
        imgs = xpath(first.data, '//h:img[position()=1]')
        src = imgs[0].get('src') if imgs else None
        if src:
            # The src is relative to the document; manifest hrefs are not.
            href = first.abshref(src)
            if href in hrefs:
                cover = hrefs[href]
    if cover:
        self.metadata.add('cover', cover.id)
def _all_from_opf(self, opf):
self._metadata_from_opf(opf)
@ -796,6 +849,7 @@ class OEBBook(object):
self._spine_from_opf(opf)
self._guide_from_opf(opf)
self._toc_from_opf(opf)
self._ensure_cover_image()
def to_opf1(self):
package = etree.Element('package',
@ -859,6 +913,7 @@ class OEBBook(object):
NCX_MIME: (href, ncx)}
def main(argv=sys.argv):
for arg in argv[1:]:
oeb = OEBBook(arg)