Further improvements to Mobi markup conversion.

2025-06-23 15:30:45 -04:00 · 2009-01-03 12:23:13 -05:00 · 2009-01-03 12:23:13 -05:00 · b637be07be
commit b637be07be
parent 35320295f8
5 changed files with 84 additions and 31 deletions
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@ -21,6 +21,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name)
 HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
 NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
 SPECIAL_TAGS = set(['hr', 'br'])
 CONTENT_TAGS = set(['img', 'hr', 'br'])
 PAGE_BREAKS = set(['always', 'odd', 'even'])
@ -59,6 +60,9 @@ class FormatState(object):
               and self.href == other.href \
               and self.valign == other.valign
    def __ne__(self, other):
        """Return the logical negation of ``self.__eq__(other)``."""
        equal = self.__eq__(other)
        return not equal
 class MobiMLizer(object):
    def __init__(self):
@ -86,14 +90,26 @@ class MobiMLizer(object):
        return self.fnums[self.fmap[ptsize]]
    def mobimlize_measure(self, ptsize):
        # All MobiML measures occur in the default font-space
        if isinstance(ptsize, basestring):
            return ptsize
-        return "%dem" % int(round(ptsize / self.profile.fbase))
+        # All MobiML measures occur in the default font-space
        fbase = self.profile.fbase
        if ptsize < fbase:
            return "%dpt" % int(round(ptsize * 2))
        return "%dem" % int(round(ptsize / fbase))
    def mobimlize_content(self, tag, text, bstate, istates):
        istate = istates[-1]
-        if bstate.para is None:
+        if istate.ids:
            body = bstate.body
            index = max((0, len(body) - 2))
            for id in istate.ids:
                body.insert(index, etree.Element('a', attrib={'id': id}))
            istate.ids.clear()
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            para = para if para is not None else bstate.body
        elif para is None:
            bstate.istate = None
            if bstate.pbreak:
                etree.SubElement(bstate.body, MBP('pagebreak'))
@ -102,7 +118,7 @@ class MobiMLizer(object):
                parent = bstate.nested[-1] if bstate.nested else bstate.body
                para = wrapper = etree.SubElement(parent, tag)
                bstate.nested.append(para)
-            elif bstate.left > 0:
+            elif bstate.left > 0 and istate.indent >= 0:
                para = wrapper = etree.SubElement(bstate.body, 'blockquote')
                left = int(round(bstate.left / self.profile.fbase)) - 1
                while left > 0:
@ -118,16 +134,7 @@ class MobiMLizer(object):
            para.attrib['width'] = self.mobimlize_measure(istate.indent)
            if istate.halign != 'auto':
                wrapper.attrib['align'] = istate.halign
            if istate.ids:
                wrapper.attrib['id'] = istate.ids.pop()
        pstate = bstate.istate
        para = bstate.para
        if istate.ids:
            body = bstate.body
            index = max((0, len(body) - 2))
            for id in istate.ids:
                body.insert(index, etree.Element('a', attrib={'id': id}))
            istate.ids.clear()
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
@ -143,7 +150,7 @@ class MobiMLizer(object):
                inline = etree.SubElement(inline, 'sup')
            elif valign == 'sub':
                inline = etree.SubElement(inline, 'sub')
-            elif fsize != 3:
+            if fsize != 3:
                inline = etree.SubElement(inline, 'font', size=str(fsize))
            if istate.italic:
                inline = etree.SubElement(inline, 'i')
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -313,7 +313,10 @@ class MobiReader(object):
        while flags:
            if flags & 1:
                num += sizeof_trailing_entry(data, size - num)
-            flags >>= 1        
+            flags >>= 1
        # Flag indicates overlapping multibyte character data
        if self.book_header.extra_flags & 1:
            num += ord(data[size - num - 1]) + 1
        return num
    def text_section(self, index):
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@ -50,9 +50,7 @@ PALMDOC = 2
 HUFFDIC = 17480
 def encode(data):
-    # Using UTF-8 means needing to worry about multibyte characters crossing
+    return data.encode('utf-8')
    # record boundaries, so let's not for now.
    return data.encode('ascii', 'xmlcharrefreplace')
 # Almost like the one for MS LIT, but not quite.
 def decint(value):
@ -193,8 +191,8 @@ class Serializer(object):
 class MobiWriter(object):
-    def __init__(self, compress=None, logger=FauxLogger()):
+    def __init__(self, compression=None, logger=FauxLogger()):
-        self._compress = compress or UNCOMPRESSED
+        self._compression = compression or UNCOMPRESSED
        self._logger = logger
    def dump(self, oeb, path):
@ -231,7 +229,39 @@ class MobiWriter(object):
            if item.media_type.startswith('image/'):
                images[item.href] = index
                index += 1
-        
+
    def _read_text_record(self, text):
        """Read the next text record from `text`, keeping UTF-8 sequences whole.

        `text` is a seekable stream of UTF-8 encoded bytes; reading starts
        at its current position.  A record nominally holds RECORD_SIZE
        bytes.  When a multibyte UTF-8 sequence straddles the nominal record
        boundary, the bytes needed to complete that sequence are appended to
        the returned data and the stream is left at the nominal boundary, so
        the same bytes are read again at the start of the following record.

        Returns the record data, or an empty string once the stream is
        exhausted.
        """
        pos = text.tell()
        text.seek(0, 2)
        npos = min((pos + RECORD_SIZE, text.tell()))
        if npos <= pos:
            # Empty or exhausted stream: there is no final character to
            # inspect, and scanning backwards from here would never end.
            text.seek(pos)
            return text.read(RECORD_SIZE)
        # Grow a window backwards from the boundary until it contains at
        # least one complete character ('ignore' drops partial sequences).
        last = ''
        while not last.decode('utf-8', 'ignore'):
            size = len(last) + 1
            if size > npos:
                # Reached the start of the stream without finding one.
                break
            text.seek(npos - size)
            last = text.read(size)
        try:
            last.decode('utf-8')
        except UnicodeDecodeError:
            pass
        else:
            # The boundary falls between characters: plain fixed-size read.
            text.seek(pos)
            return text.read(RECORD_SIZE)
        # The window ends in a partial sequence.  Extend it forwards one
        # byte at a time until the whole window decodes cleanly.
        prev = len(last)
        while True:
            text.seek(npos - prev)
            grown = text.read(len(last) + 1)
            if len(grown) == len(last):
                # End of stream: the sequence can never be completed.
                break
            last = grown
            try:
                last.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                break
        # Number of bytes beyond the nominal boundary that were needed.
        extra = len(last) - prev
        text.seek(pos)
        data = text.read(RECORD_SIZE + extra)
        # Rewind to the nominal boundary so the next record overlaps.
        text.seek(npos)
        return data
    def _generate_text(self):
        serializer = Serializer(self._oeb, self._images)
        breaks = serializer.breaks
@ -240,14 +270,14 @@ class MobiWriter(object):
        text = StringIO(text)
        nrecords = 0
        offset = 0
-        data = text.read(RECORD_SIZE)
+        data = self._read_text_record(text)
        while len(data) > 0:
-            if self._compress == PALMDOC:
+            size = len(data)
            if self._compression == PALMDOC:
                data = compress_doc(data)
            record = StringIO()
            record.write(data)
-            # Without the NUL Mobipocket Desktop 6.2 will thrash.  Why?
+            record.write(pack('>B', max((0, size - RECORD_SIZE))))
            record.write('\0')
            nextra = 0
            pbreak = 0
            running = offset
@ -261,7 +291,7 @@ class MobiWriter(object):
            self._records.append(record.getvalue())
            nrecords += 1
            offset += RECORD_SIZE
-            data = text.read(RECORD_SIZE)
+            data = self._read_text_record(text)
        self._text_nrecords = nrecords
    def _rescale_image(self, data, maxsizeb, dimen=None):
@ -304,8 +334,8 @@ class MobiWriter(object):
        metadata = self._oeb.metadata
        exth = self._build_exth()
        record0 = StringIO()
-        record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length,
+        record0.write(pack('>HHIHHHH', self._compression, 0,
-            self._text_nrecords, RECORD_SIZE, 0, 0))
+            self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0))
        uid = random.randint(0, 0xffffffff)
        title = str(metadata.title[0])
        record0.write('MOBI')
@ -320,7 +350,11 @@ class MobiWriter(object):
        record0.write(pack('>I', 0x50))
        record0.write('\0' * 32)
        record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
-        # TODO: What the hell are these fields?
+        # The '5' is a bitmask of extra record data at the end:
        #   - 0x1: <extra multibyte bytes><size> (?)
        #   - 0x4: <uncrossable breaks><size>
        # Of course, the formats aren't quite the same.
        # TODO: What the hell are the rest of these fields?
        record0.write(pack('>IIIIIIIIIIIIIIIII',
            0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
            0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff))
@ -391,9 +425,10 @@ class MobiWriter(object):
 def main(argv=sys.argv):
    from calibre.ebooks.oeb.base import DirWriter
    inpath, outpath = argv[1:]
-    context = Context('MSReader', 'Cybook3')
+    context = Context('MSReader', 'MobiDesktop')
    oeb = OEBBook(inpath)
-    writer = MobiWriter(compress=PALMDOC)
+    #writer = MobiWriter(compression=PALMDOC)
    writer = MobiWriter(compression=UNCOMPRESSED)
    #writer = DirWriter()
    fbase = context.dest.fbase
    fkey = context.dest.fnums.values()
--- a/src/calibre/ebooks/oeb/profile.py
+++ b/src/calibre/ebooks/oeb/profile.py
@ -39,6 +39,11 @@ PROFILES = {
        Profile(width=480, height=652, dpi=100.0, fbase=13,
                fsizes=[10, 11, 13, 16, 18, 20, 22, 26]),
    # Not really, but let's pretend
    'MobiDesktop':
        Profile(width=340, height=400, dpi=100, fbase=12,
                fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
    # No clue on usable screen size and DPI
    'Cybook3':
        Profile(width=584, height=754, dpi=168.451, fbase=12,
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@ -170,6 +170,9 @@ class CSSFlattener(object):
                left -= style['text-indent']
            if self.unfloat and 'float' in cssdict and tag != 'img':
                del cssdict['float']
            if 'vertical-align' in cssdict:
                if cssdict['vertical-align'] == 'sup':
                    cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh