Further improvements to Mobi markup conversion.

This commit is contained in:
Marshall T. Vandegrift 2009-01-03 12:23:13 -05:00
parent 35320295f8
commit b637be07be
5 changed files with 84 additions and 31 deletions

View File

@ -21,6 +21,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name)
HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
SPECIAL_TAGS = set(['hr', 'br'])
CONTENT_TAGS = set(['img', 'hr', 'br']) CONTENT_TAGS = set(['img', 'hr', 'br'])
PAGE_BREAKS = set(['always', 'odd', 'even']) PAGE_BREAKS = set(['always', 'odd', 'even'])
@ -59,6 +60,9 @@ class FormatState(object):
and self.href == other.href \ and self.href == other.href \
and self.valign == other.valign and self.valign == other.valign
def __ne__(self, other):
    """Return True when *other* does not compare equal to this state.

    Defined explicitly as the negation of ``__eq__`` because Python 2
    does not derive ``!=`` from ``__eq__`` automatically.
    """
    return not self.__eq__(other)
class MobiMLizer(object): class MobiMLizer(object):
def __init__(self): def __init__(self):
@ -86,14 +90,26 @@ class MobiMLizer(object):
return self.fnums[self.fmap[ptsize]] return self.fnums[self.fmap[ptsize]]
def mobimlize_measure(self, ptsize): def mobimlize_measure(self, ptsize):
# All MobiML measures occur in the default font-space
if isinstance(ptsize, basestring): if isinstance(ptsize, basestring):
return ptsize return ptsize
return "%dem" % int(round(ptsize / self.profile.fbase)) # All MobiML measures occur in the default font-space
fbase = self.profile.fbase
if ptsize < fbase:
return "%dpt" % int(round(ptsize * 2))
return "%dem" % int(round(ptsize / fbase))
def mobimlize_content(self, tag, text, bstate, istates): def mobimlize_content(self, tag, text, bstate, istates):
istate = istates[-1] istate = istates[-1]
if bstate.para is None: if istate.ids:
body = bstate.body
index = max((0, len(body) - 2))
for id in istate.ids:
body.insert(index, etree.Element('a', attrib={'id': id}))
istate.ids.clear()
para = bstate.para
if tag in SPECIAL_TAGS and not text:
para = para if para is not None else bstate.body
elif para is None:
bstate.istate = None bstate.istate = None
if bstate.pbreak: if bstate.pbreak:
etree.SubElement(bstate.body, MBP('pagebreak')) etree.SubElement(bstate.body, MBP('pagebreak'))
@ -102,7 +118,7 @@ class MobiMLizer(object):
parent = bstate.nested[-1] if bstate.nested else bstate.body parent = bstate.nested[-1] if bstate.nested else bstate.body
para = wrapper = etree.SubElement(parent, tag) para = wrapper = etree.SubElement(parent, tag)
bstate.nested.append(para) bstate.nested.append(para)
elif bstate.left > 0: elif bstate.left > 0 and istate.indent >= 0:
para = wrapper = etree.SubElement(bstate.body, 'blockquote') para = wrapper = etree.SubElement(bstate.body, 'blockquote')
left = int(round(bstate.left / self.profile.fbase)) - 1 left = int(round(bstate.left / self.profile.fbase)) - 1
while left > 0: while left > 0:
@ -118,16 +134,7 @@ class MobiMLizer(object):
para.attrib['width'] = self.mobimlize_measure(istate.indent) para.attrib['width'] = self.mobimlize_measure(istate.indent)
if istate.halign != 'auto': if istate.halign != 'auto':
wrapper.attrib['align'] = istate.halign wrapper.attrib['align'] = istate.halign
if istate.ids:
wrapper.attrib['id'] = istate.ids.pop()
pstate = bstate.istate pstate = bstate.istate
para = bstate.para
if istate.ids:
body = bstate.body
index = max((0, len(body) - 2))
for id in istate.ids:
body.insert(index, etree.Element('a', attrib={'id': id}))
istate.ids.clear()
if tag in CONTENT_TAGS: if tag in CONTENT_TAGS:
bstate.inline = para bstate.inline = para
pstate = bstate.istate = None pstate = bstate.istate = None
@ -143,7 +150,7 @@ class MobiMLizer(object):
inline = etree.SubElement(inline, 'sup') inline = etree.SubElement(inline, 'sup')
elif valign == 'sub': elif valign == 'sub':
inline = etree.SubElement(inline, 'sub') inline = etree.SubElement(inline, 'sub')
elif fsize != 3: if fsize != 3:
inline = etree.SubElement(inline, 'font', size=str(fsize)) inline = etree.SubElement(inline, 'font', size=str(fsize))
if istate.italic: if istate.italic:
inline = etree.SubElement(inline, 'i') inline = etree.SubElement(inline, 'i')

View File

@ -313,7 +313,10 @@ class MobiReader(object):
while flags: while flags:
if flags & 1: if flags & 1:
num += sizeof_trailing_entry(data, size - num) num += sizeof_trailing_entry(data, size - num)
flags >>= 1 flags >>= 1
# Flag indicates overlapping multibyte character data
if self.book_header.extra_flags & 1:
num += ord(data[size - num - 1]) + 1
return num return num
def text_section(self, index): def text_section(self, index):

View File

@ -50,9 +50,7 @@ PALMDOC = 2
HUFFDIC = 17480 HUFFDIC = 17480
def encode(data): def encode(data):
# Using UTF-8 means needing to worry about multibyte characters crossing return data.encode('utf-8')
# record boundaries, so let's not for now.
return data.encode('ascii', 'xmlcharrefreplace')
# Almost like the one for MS LIT, but not quite. # Almost like the one for MS LIT, but not quite.
def decint(value): def decint(value):
@ -193,8 +191,8 @@ class Serializer(object):
class MobiWriter(object): class MobiWriter(object):
def __init__(self, compress=None, logger=FauxLogger()): def __init__(self, compression=None, logger=FauxLogger()):
self._compress = compress or UNCOMPRESSED self._compression = compression or UNCOMPRESSED
self._logger = logger self._logger = logger
def dump(self, oeb, path): def dump(self, oeb, path):
@ -231,7 +229,39 @@ class MobiWriter(object):
if item.media_type.startswith('image/'): if item.media_type.startswith('image/'):
images[item.href] = index images[item.href] = index
index += 1 index += 1
def _read_text_record(self, text):
    """Read the next text record from the file-like object *text*.

    A record nominally holds ``RECORD_SIZE`` bytes starting at the
    current position.  If that boundary would fall inside a multibyte
    UTF-8 sequence, the bytes needed to finish the character are
    appended to the returned data, while the stream position is still
    left exactly on the nominal boundary, so the next record starts
    with those same overlapping bytes.

    Returns the record data, or an empty string once *text* is
    exhausted (including when *text* is empty to begin with).
    """
    pos = text.tell()
    text.seek(0, 2)
    # Nominal end of this record, clamped to the end of the text.
    npos = min((pos + RECORD_SIZE, text.tell()))
    if npos <= pos:
        # Nothing left to read.  Without this guard an empty *text*
        # makes the backwards scan below spin forever, because it can
        # never obtain a byte to decode.
        text.seek(pos)
        return ''
    # Grow a window backwards from the boundary until it contains at
    # least one completely decodable character.
    last = ''
    while not last.decode('utf-8', 'ignore'):
        size = len(last) + 1
        text.seek(npos - size)
        last = text.read(size)
    try:
        last.decode('utf-8')
    except UnicodeDecodeError:
        pass
    else:
        # The boundary already falls between characters.
        text.seek(pos)
        return text.read(RECORD_SIZE)
    # The boundary splits a character: extend the window forwards one
    # byte at a time until it decodes cleanly.
    prev = len(last)
    tail = last
    while True:
        text.seek(npos - prev)
        grown = text.read(len(tail) + 1)
        if len(grown) <= len(tail):
            # Hit end of text without completing the sequence (the
            # input is truncated or not valid UTF-8).  Give up on the
            # overlap instead of looping forever.
            tail = last
            break
        tail = grown
        try:
            tail.decode('utf-8')
        except UnicodeDecodeError:
            continue
        break
    # Number of bytes past the nominal boundary needed to finish the
    # split character (zero if the sequence could not be completed).
    extra = len(tail) - prev
    text.seek(pos)
    data = text.read(RECORD_SIZE + extra)
    # Leave the stream on the nominal boundary for the next record.
    text.seek(npos)
    return data
def _generate_text(self): def _generate_text(self):
serializer = Serializer(self._oeb, self._images) serializer = Serializer(self._oeb, self._images)
breaks = serializer.breaks breaks = serializer.breaks
@ -240,14 +270,14 @@ class MobiWriter(object):
text = StringIO(text) text = StringIO(text)
nrecords = 0 nrecords = 0
offset = 0 offset = 0
data = text.read(RECORD_SIZE) data = self._read_text_record(text)
while len(data) > 0: while len(data) > 0:
if self._compress == PALMDOC: size = len(data)
if self._compression == PALMDOC:
data = compress_doc(data) data = compress_doc(data)
record = StringIO() record = StringIO()
record.write(data) record.write(data)
# Without the NUL Mobipocket Desktop 6.2 will thrash. Why? record.write(pack('>B', max((0, size - RECORD_SIZE))))
record.write('\0')
nextra = 0 nextra = 0
pbreak = 0 pbreak = 0
running = offset running = offset
@ -261,7 +291,7 @@ class MobiWriter(object):
self._records.append(record.getvalue()) self._records.append(record.getvalue())
nrecords += 1 nrecords += 1
offset += RECORD_SIZE offset += RECORD_SIZE
data = text.read(RECORD_SIZE) data = self._read_text_record(text)
self._text_nrecords = nrecords self._text_nrecords = nrecords
def _rescale_image(self, data, maxsizeb, dimen=None): def _rescale_image(self, data, maxsizeb, dimen=None):
@ -304,8 +334,8 @@ class MobiWriter(object):
metadata = self._oeb.metadata metadata = self._oeb.metadata
exth = self._build_exth() exth = self._build_exth()
record0 = StringIO() record0 = StringIO()
record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length, record0.write(pack('>HHIHHHH', self._compression, 0,
self._text_nrecords, RECORD_SIZE, 0, 0)) self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0))
uid = random.randint(0, 0xffffffff) uid = random.randint(0, 0xffffffff)
title = str(metadata.title[0]) title = str(metadata.title[0])
record0.write('MOBI') record0.write('MOBI')
@ -320,7 +350,11 @@ class MobiWriter(object):
record0.write(pack('>I', 0x50)) record0.write(pack('>I', 0x50))
record0.write('\0' * 32) record0.write('\0' * 32)
record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0)) record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
# TODO: What the hell are these fields? # The '5' is a bitmask of extra record data at the end:
# - 0x1: <extra multibyte bytes><size> (?)
# - 0x4: <uncrossable breaks><size>
# Of course, the formats aren't quite the same.
# TODO: What the hell are the rest of these fields?
record0.write(pack('>IIIIIIIIIIIIIIIII', record0.write(pack('>IIIIIIIIIIIIIIIII',
0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff)) 0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff))
@ -391,9 +425,10 @@ class MobiWriter(object):
def main(argv=sys.argv): def main(argv=sys.argv):
from calibre.ebooks.oeb.base import DirWriter from calibre.ebooks.oeb.base import DirWriter
inpath, outpath = argv[1:] inpath, outpath = argv[1:]
context = Context('MSReader', 'Cybook3') context = Context('MSReader', 'MobiDesktop')
oeb = OEBBook(inpath) oeb = OEBBook(inpath)
writer = MobiWriter(compress=PALMDOC) #writer = MobiWriter(compression=PALMDOC)
writer = MobiWriter(compression=UNCOMPRESSED)
#writer = DirWriter() #writer = DirWriter()
fbase = context.dest.fbase fbase = context.dest.fbase
fkey = context.dest.fnums.values() fkey = context.dest.fnums.values()

View File

@ -39,6 +39,11 @@ PROFILES = {
Profile(width=480, height=652, dpi=100.0, fbase=13, Profile(width=480, height=652, dpi=100.0, fbase=13,
fsizes=[10, 11, 13, 16, 18, 20, 22, 26]), fsizes=[10, 11, 13, 16, 18, 20, 22, 26]),
# Not really, but let's pretend
'MobiDesktop':
Profile(width=340, height=400, dpi=100, fbase=12,
fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
# No clue on usable screen size and DPI # No clue on usable screen size and DPI
'Cybook3': 'Cybook3':
Profile(width=584, height=754, dpi=168.451, fbase=12, Profile(width=584, height=754, dpi=168.451, fbase=12,

View File

@ -170,6 +170,9 @@ class CSSFlattener(object):
left -= style['text-indent'] left -= style['text-indent']
if self.unfloat and 'float' in cssdict and tag != 'img': if self.unfloat and 'float' in cssdict and tag != 'img':
del cssdict['float'] del cssdict['float']
if 'vertical-align' in cssdict:
if cssdict['vertical-align'] == 'sup':
cssdict['vertical-align'] = 'super'
if self.lineh and 'line-height' not in cssdict: if self.lineh and 'line-height' not in cssdict:
lineh = self.lineh / psize lineh = self.lineh / psize
cssdict['line-height'] = "%0.5fem" % lineh cssdict['line-height'] = "%0.5fem" % lineh