From c4582ed2a8113eeacc5e7ee52d5c8669146f814c Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Fri, 2 Jan 2009 09:15:44 -0500 Subject: [PATCH] Significantly improve Mobipocket content conversion. --- src/calibre/ebooks/mobi/mobiml.py | 31 +++++++++++++++++++++++++------ src/calibre/ebooks/mobi/writer.py | 21 ++++++++++++++------- src/calibre/ebooks/oeb/profile.py | 2 +- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 7d92d7934f..3583ea2f4a 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -23,6 +23,8 @@ HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td']) CONTENT_TAGS = set(['img', 'hr', 'br']) +PAGE_BREAKS = set(['always', 'odd', 'even']) + COLLAPSE = re.compile(r'[ \t\r\n\v]+') class BlockState(object): @@ -34,6 +36,7 @@ class BlockState(object): self.vpadding = 0. self.vmargin = 0. self.left = 0. + self.pbreak = False self.istate = None class FormatState(object): @@ -92,6 +95,9 @@ class MobiMLizer(object): istate = istates[-1] if bstate.para is None: bstate.istate = None + if bstate.pbreak: + etree.SubElement(bstate.body, MBP('pagebreak')) + bstate.pbreak = False if tag in NESTABLE_TAGS: parent = bstate.nested[-1] if bstate.nested else bstate.body para = wrapper = etree.SubElement(parent, tag) @@ -117,8 +123,11 @@ class MobiMLizer(object): pstate = bstate.istate para = bstate.para if istate.ids: + body = bstate.body + index = max((0, len(body) - 2)) for id in istate.ids: - etree.SubElement(para, 'a', id=id) + body.insert(index, etree.Element('a', attrib={'id': id})) + istate.ids.clear() if tag in CONTENT_TAGS: bstate.inline = para pstate = bstate.istate = None @@ -176,9 +185,12 @@ class MobiMLizer(object): left = margin + padding bstate.left += left bstate.vmargin = max((bstate.vmargin, style['margin-top'])) - if style['padding-top']: + padding = style['padding-top'] + if isinstance(padding, (int, float)) and padding > 0: bstate.vpadding += bstate.vmargin - bstate.vpadding = style['padding-top'] + bstate.vpadding = padding + if style['page-break-before'] in PAGE_BREAKS: + bstate.pbreak = True istate.fsize = self.mobimlize_font(style['font-size']) istate.italic = True if style['font-style'] == 'italic' else False weight = style['font-weight'] @@ -207,6 +219,7 @@ class MobiMLizer(object): istate.attrib.clear() if tag == 'img' and 'src' in elem.attrib: istate.attrib['src'] = elem.attrib['src'] + istate.attrib['align'] = 'baseline' if tag == 'hr' and 'width' in style.cssdict(): istate.attrib['width'] = mobimlize_measure(style['width']) text = None @@ -225,19 +238,25 @@ class MobiMLizer(object): if child.tail: if istate.preserve: tail = child.tail - elif bstate.para is None and child.text.isspace(): + elif bstate.para is None and child.tail.isspace(): tail = None else: tail = COLLAPSE.sub(' ', child.tail) if tail: self.mobimlize_content(tag, tail, bstate, istates) + if style['page-break-after'] in PAGE_BREAKS: + bstate.pbreak = True if isblock: + para = bstate.para + if para is not None and para.text == u'\xa0': + para.getparent().replace(para, etree.Element('br')) bstate.para = None bstate.left -= left bstate.vmargin = max((bstate.vmargin, style['margin-bottom'])) - if style['padding-bottom']: + padding = style['padding-bottom'] + if isinstance(padding, (int, float)) and padding > 0: bstate.vpadding += bstate.vmargin - bstate.vpadding = style['padding-bottom'] + bstate.vpadding = padding if bstate.nested: bstate.nested.pop() istates.pop() diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index de54d979c3..77b382e90a 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -50,7 +50,8 @@ PALMDOC = 2 HUFFDIC = 17480 def encode(data): - return data.encode('ascii', 'xmlcharrefreplace') + #return data.encode('ascii', 'xmlcharrefreplace') + return data.encode('utf-8') # Almost like the one for MS LIT, but not quite. def decint(value): @@ -92,12 +93,16 @@ class Serializer(object): def serialize_guide(self): buffer = self.buffer + hrefs = self.oeb.manifest.hrefs buffer.write('') for ref in self.oeb.guide.values(): + path, frag = urldefrag(ref.href) + if hrefs[path].media_type not in OEB_DOCS: + continue buffer.write('') + buffer.write(' />') buffer.write('') def serialize_href(self, href, base=None): @@ -238,19 +243,21 @@ class MobiWriter(object): while len(data) > 0: if self._compress == PALMDOC: data = compress_doc(data) + record = StringIO() + record.write(data) # Without the NUL Mobipocket Desktop 6.2 will thrash. Why? - record = [data, '\0'] + record.write('\0') nextra = 0 pbreak = 0 running = offset while breaks and (breaks[0] - offset) < RECORD_SIZE: pbreak = (breaks.pop(0) - running) >> 3 encoded = decint(pbreak) - record.append(encoded) + record.write(encoded) running += pbreak << 3 nextra += len(encoded) - record.append(decint(nextra + 1)) - self._records.append(''.join(record)) + record.write(decint(nextra + 1)) + self._records.append(record.getvalue()) nrecords += 1 offset += RECORD_SIZE data = text.read(RECORD_SIZE) @@ -385,7 +392,7 @@ def main(argv=sys.argv): inpath, outpath = argv[1:] context = Context('MSReader', 'Cybook3') oeb = OEBBook(inpath) - writer = MobiWriter() + writer = MobiWriter(compress=PALMDOC) #writer = DirWriter() fbase = context.dest.fbase fkey = context.dest.fnums.values() diff --git a/src/calibre/ebooks/oeb/profile.py b/src/calibre/ebooks/oeb/profile.py index 95fdd5ab9b..1d58900b28 100644 --- a/src/calibre/ebooks/oeb/profile.py +++ b/src/calibre/ebooks/oeb/profile.py @@ -42,7 +42,7 @@ PROFILES = { # No clue on usable screen size and DPI 'Cybook3': Profile(width=584, height=754, dpi=168.451, fbase=12, - fsizes=[9, 10, 11, 12, 14, 16, 18, 20]), + fsizes=[9, 10, 11, 12, 14, 17, 20, 24]), 'Firefox': Profile(width=800, height=600, dpi=100.0, fbase=12,