Significantly improve Mobipocket content conversion.

This commit is contained in:
Marshall T. Vandegrift 2009-01-02 09:15:44 -05:00
parent fc59f7b63d
commit c4582ed2a8
3 changed files with 40 additions and 14 deletions

View File

@@ -23,6 +23,8 @@ HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
CONTENT_TAGS = set(['img', 'hr', 'br']) CONTENT_TAGS = set(['img', 'hr', 'br'])
PAGE_BREAKS = set(['always', 'odd', 'even'])
COLLAPSE = re.compile(r'[ \t\r\n\v]+') COLLAPSE = re.compile(r'[ \t\r\n\v]+')
class BlockState(object): class BlockState(object):
@@ -34,6 +36,7 @@ class BlockState(object):
self.vpadding = 0. self.vpadding = 0.
self.vmargin = 0. self.vmargin = 0.
self.left = 0. self.left = 0.
self.pbreak = False
self.istate = None self.istate = None
class FormatState(object): class FormatState(object):
@@ -92,6 +95,9 @@ class MobiMLizer(object):
istate = istates[-1] istate = istates[-1]
if bstate.para is None: if bstate.para is None:
bstate.istate = None bstate.istate = None
if bstate.pbreak:
etree.SubElement(bstate.body, MBP('pagebreak'))
bstate.pbreak = False
if tag in NESTABLE_TAGS: if tag in NESTABLE_TAGS:
parent = bstate.nested[-1] if bstate.nested else bstate.body parent = bstate.nested[-1] if bstate.nested else bstate.body
para = wrapper = etree.SubElement(parent, tag) para = wrapper = etree.SubElement(parent, tag)
@@ -117,8 +123,11 @@ class MobiMLizer(object):
pstate = bstate.istate pstate = bstate.istate
para = bstate.para para = bstate.para
if istate.ids: if istate.ids:
body = bstate.body
index = max((0, len(body) - 2))
for id in istate.ids: for id in istate.ids:
etree.SubElement(para, 'a', id=id) body.insert(index, etree.Element('a', attrib={'id': id}))
istate.ids.clear()
if tag in CONTENT_TAGS: if tag in CONTENT_TAGS:
bstate.inline = para bstate.inline = para
pstate = bstate.istate = None pstate = bstate.istate = None
@@ -176,9 +185,12 @@ class MobiMLizer(object):
left = margin + padding left = margin + padding
bstate.left += left bstate.left += left
bstate.vmargin = max((bstate.vmargin, style['margin-top'])) bstate.vmargin = max((bstate.vmargin, style['margin-top']))
if style['padding-top']: padding = style['padding-top']
if isinstance(padding, (int, float)) and padding > 0:
bstate.vpadding += bstate.vmargin bstate.vpadding += bstate.vmargin
bstate.vpadding = style['padding-top'] bstate.vpadding = padding
if style['page-break-before'] in PAGE_BREAKS:
bstate.pbreak = True
istate.fsize = self.mobimlize_font(style['font-size']) istate.fsize = self.mobimlize_font(style['font-size'])
istate.italic = True if style['font-style'] == 'italic' else False istate.italic = True if style['font-style'] == 'italic' else False
weight = style['font-weight'] weight = style['font-weight']
@@ -207,6 +219,7 @@ class MobiMLizer(object):
istate.attrib.clear() istate.attrib.clear()
if tag == 'img' and 'src' in elem.attrib: if tag == 'img' and 'src' in elem.attrib:
istate.attrib['src'] = elem.attrib['src'] istate.attrib['src'] = elem.attrib['src']
istate.attrib['align'] = 'baseline'
if tag == 'hr' and 'width' in style.cssdict(): if tag == 'hr' and 'width' in style.cssdict():
istate.attrib['width'] = mobimlize_measure(style['width']) istate.attrib['width'] = mobimlize_measure(style['width'])
text = None text = None
@@ -225,19 +238,25 @@ class MobiMLizer(object):
if child.tail: if child.tail:
if istate.preserve: if istate.preserve:
tail = child.tail tail = child.tail
elif bstate.para is None and child.text.isspace(): elif bstate.para is None and child.tail.isspace():
tail = None tail = None
else: else:
tail = COLLAPSE.sub(' ', child.tail) tail = COLLAPSE.sub(' ', child.tail)
if tail: if tail:
self.mobimlize_content(tag, tail, bstate, istates) self.mobimlize_content(tag, tail, bstate, istates)
if style['page-break-after'] in PAGE_BREAKS:
bstate.pbreak = True
if isblock: if isblock:
para = bstate.para
if para is not None and para.text == u'\xa0':
para.getparent().replace(para, etree.Element('br'))
bstate.para = None bstate.para = None
bstate.left -= left bstate.left -= left
bstate.vmargin = max((bstate.vmargin, style['margin-bottom'])) bstate.vmargin = max((bstate.vmargin, style['margin-bottom']))
if style['padding-bottom']: padding = style['padding-bottom']
if isinstance(padding, (int, float)) and padding > 0:
bstate.vpadding += bstate.vmargin bstate.vpadding += bstate.vmargin
bstate.vpadding = style['padding-bottom'] bstate.vpadding = padding
if bstate.nested: if bstate.nested:
bstate.nested.pop() bstate.nested.pop()
istates.pop() istates.pop()

View File

@@ -50,7 +50,8 @@ PALMDOC = 2
HUFFDIC = 17480 HUFFDIC = 17480
def encode(data): def encode(data):
return data.encode('ascii', 'xmlcharrefreplace') #return data.encode('ascii', 'xmlcharrefreplace')
return data.encode('utf-8')
# Almost like the one for MS LIT, but not quite. # Almost like the one for MS LIT, but not quite.
def decint(value): def decint(value):
@@ -92,8 +93,12 @@ class Serializer(object):
def serialize_guide(self): def serialize_guide(self):
buffer = self.buffer buffer = self.buffer
hrefs = self.oeb.manifest.hrefs
buffer.write('<guide>') buffer.write('<guide>')
for ref in self.oeb.guide.values(): for ref in self.oeb.guide.values():
path, frag = urldefrag(ref.href)
if hrefs[path].media_type not in OEB_DOCS:
continue
buffer.write('<reference title="%s" type="%s" ' buffer.write('<reference title="%s" type="%s" '
% (ref.title, ref.type)) % (ref.title, ref.type))
self.serialize_href(ref.href) self.serialize_href(ref.href)
@@ -238,19 +243,21 @@ class MobiWriter(object):
while len(data) > 0: while len(data) > 0:
if self._compress == PALMDOC: if self._compress == PALMDOC:
data = compress_doc(data) data = compress_doc(data)
record = StringIO()
record.write(data)
# Without the NUL Mobipocket Desktop 6.2 will thrash. Why? # Without the NUL Mobipocket Desktop 6.2 will thrash. Why?
record = [data, '\0'] record.write('\0')
nextra = 0 nextra = 0
pbreak = 0 pbreak = 0
running = offset running = offset
while breaks and (breaks[0] - offset) < RECORD_SIZE: while breaks and (breaks[0] - offset) < RECORD_SIZE:
pbreak = (breaks.pop(0) - running) >> 3 pbreak = (breaks.pop(0) - running) >> 3
encoded = decint(pbreak) encoded = decint(pbreak)
record.append(encoded) record.write(encoded)
running += pbreak << 3 running += pbreak << 3
nextra += len(encoded) nextra += len(encoded)
record.append(decint(nextra + 1)) record.write(decint(nextra + 1))
self._records.append(''.join(record)) self._records.append(record.getvalue())
nrecords += 1 nrecords += 1
offset += RECORD_SIZE offset += RECORD_SIZE
data = text.read(RECORD_SIZE) data = text.read(RECORD_SIZE)
@@ -385,7 +392,7 @@ def main(argv=sys.argv):
inpath, outpath = argv[1:] inpath, outpath = argv[1:]
context = Context('MSReader', 'Cybook3') context = Context('MSReader', 'Cybook3')
oeb = OEBBook(inpath) oeb = OEBBook(inpath)
writer = MobiWriter() writer = MobiWriter(compress=PALMDOC)
#writer = DirWriter() #writer = DirWriter()
fbase = context.dest.fbase fbase = context.dest.fbase
fkey = context.dest.fnums.values() fkey = context.dest.fnums.values()

View File

@@ -42,7 +42,7 @@ PROFILES = {
# No clue on usable screen size and DPI # No clue on usable screen size and DPI
'Cybook3': 'Cybook3':
Profile(width=584, height=754, dpi=168.451, fbase=12, Profile(width=584, height=754, dpi=168.451, fbase=12,
fsizes=[9, 10, 11, 12, 14, 16, 18, 20]), fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
'Firefox': 'Firefox':
Profile(width=800, height=600, dpi=100.0, fbase=12, Profile(width=800, height=600, dpi=100.0, fbase=12,