Further improvements to Mobi markup conversion.

2025-06-23 15:30:45 -04:00 · 2009-01-03 12:23:13 -05:00 · 2009-01-03 12:23:13 -05:00 · b637be07be
commit b637be07be
parent 35320295f8
5 changed files with 84 additions and 31 deletions
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -21,6 +21,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name)

 HEADER_TAGS = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
 NESTABLE_TAGS = set(['ol', 'ul', 'li', 'table', 'tr', 'td'])
+SPECIAL_TAGS = set(['hr', 'br'])
 CONTENT_TAGS = set(['img', 'hr', 'br'])

 PAGE_BREAKS = set(['always', 'odd', 'even'])
@@ -59,6 +60,9 @@ class FormatState(object):
               and self.href == other.href \
               and self.valign == other.valign

+    def __ne__(self, other):
+        return not self.__eq__(other)
+

 class MobiMLizer(object):
    def __init__(self):
@@ -86,14 +90,26 @@ class MobiMLizer(object):
        return self.fnums[self.fmap[ptsize]]

    def mobimlize_measure(self, ptsize):
-        # All MobiML measures occur in the default font-space
        if isinstance(ptsize, basestring):
            return ptsize
-        return "%dem" % int(round(ptsize / self.profile.fbase))
+        # All MobiML measures occur in the default font-space
+        fbase = self.profile.fbase
+        if ptsize < fbase:
+            return "%dpt" % int(round(ptsize * 2))
+        return "%dem" % int(round(ptsize / fbase))

    def mobimlize_content(self, tag, text, bstate, istates):
        istate = istates[-1]
-        if bstate.para is None:
+        if istate.ids:
+            body = bstate.body
+            index = max((0, len(body) - 2))
+            for id in istate.ids:
+                body.insert(index, etree.Element('a', attrib={'id': id}))
+            istate.ids.clear()
+        para = bstate.para
+        if tag in SPECIAL_TAGS and not text:
+            para = para if para is not None else bstate.body
+        elif para is None:
            bstate.istate = None
            if bstate.pbreak:
                etree.SubElement(bstate.body, MBP('pagebreak'))
@@ -102,7 +118,7 @@ class MobiMLizer(object):
                parent = bstate.nested[-1] if bstate.nested else bstate.body
                para = wrapper = etree.SubElement(parent, tag)
                bstate.nested.append(para)
-            elif bstate.left > 0:
+            elif bstate.left > 0 and istate.indent >= 0:
                para = wrapper = etree.SubElement(bstate.body, 'blockquote')
                left = int(round(bstate.left / self.profile.fbase)) - 1
                while left > 0:
@@ -118,16 +134,7 @@ class MobiMLizer(object):
            para.attrib['width'] = self.mobimlize_measure(istate.indent)
            if istate.halign != 'auto':
                wrapper.attrib['align'] = istate.halign
-            if istate.ids:
-                wrapper.attrib['id'] = istate.ids.pop()
        pstate = bstate.istate
-        para = bstate.para
-        if istate.ids:
-            body = bstate.body
-            index = max((0, len(body) - 2))
-            for id in istate.ids:
-                body.insert(index, etree.Element('a', attrib={'id': id}))
-            istate.ids.clear()
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
@@ -143,7 +150,7 @@ class MobiMLizer(object):
                inline = etree.SubElement(inline, 'sup')
            elif valign == 'sub':
                inline = etree.SubElement(inline, 'sub')
-            elif fsize != 3:
+            if fsize != 3:
                inline = etree.SubElement(inline, 'font', size=str(fsize))
            if istate.italic:
                inline = etree.SubElement(inline, 'i')
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -313,7 +313,10 @@ class MobiReader(object):
        while flags:
            if flags & 1:
                num += sizeof_trailing_entry(data, size - num)
-            flags >>= 1        
+            flags >>= 1
+        # Flag indicates overlapping multibyte character data
+        if self.book_header.extra_flags & 1:
+            num += ord(data[size - num - 1]) + 1
        return num

    def text_section(self, index):
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@@ -50,9 +50,7 @@ PALMDOC = 2
 HUFFDIC = 17480

 def encode(data):
-    # Using UTF-8 means needing to worry about multibyte characters crossing
-    # record boundaries, so let's not for now.
-    return data.encode('ascii', 'xmlcharrefreplace')
+    return data.encode('utf-8')

 # Almost like the one for MS LIT, but not quite.
 def decint(value):
@@ -193,8 +191,8 @@ class Serializer(object):

    
 class MobiWriter(object):
-    def __init__(self, compress=None, logger=FauxLogger()):
-        self._compress = compress or UNCOMPRESSED
+    def __init__(self, compression=None, logger=FauxLogger()):
+        self._compression = compression or UNCOMPRESSED
        self._logger = logger

    def dump(self, oeb, path):
@@ -231,7 +229,39 @@ class MobiWriter(object):
            if item.media_type.startswith('image/'):
                images[item.href] = index
                index += 1
-        
+
+    def _read_text_record(self, text):
+        pos = text.tell()
+        text.seek(0, 2)
+        npos = min((pos + RECORD_SIZE, text.tell()))
+        last = ''
+        while not last.decode('utf-8', 'ignore'):
+            size = len(last) + 1
+            text.seek(npos - size)
+            last = text.read(size)
+        try:
+            last.decode('utf-8')
+        except UnicodeDecodeError:
+            pass
+        else:
+            text.seek(pos)
+            return text.read(RECORD_SIZE)
+        prev = len(last)
+        while True:
+            text.seek(npos - prev)
+            last = text.read(len(last) + 1)
+            try:
+                last.decode('utf-8')
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        extra = len(last) - prev
+        text.seek(pos)
+        data = text.read(RECORD_SIZE + extra)
+        text.seek(npos)
+        return data
+                
    def _generate_text(self):
        serializer = Serializer(self._oeb, self._images)
        breaks = serializer.breaks
@@ -240,14 +270,14 @@ class MobiWriter(object):
        text = StringIO(text)
        nrecords = 0
        offset = 0
-        data = text.read(RECORD_SIZE)
+        data = self._read_text_record(text)
        while len(data) > 0:
-            if self._compress == PALMDOC:
+            size = len(data)
+            if self._compression == PALMDOC:
                data = compress_doc(data)
            record = StringIO()
            record.write(data)
-            # Without the NUL Mobipocket Desktop 6.2 will thrash.  Why?
-            record.write('\0')
+            record.write(pack('>B', max((0, size - RECORD_SIZE))))
            nextra = 0
            pbreak = 0
            running = offset
@@ -261,7 +291,7 @@ class MobiWriter(object):
            self._records.append(record.getvalue())
            nrecords += 1
            offset += RECORD_SIZE
-            data = text.read(RECORD_SIZE)
+            data = self._read_text_record(text)
        self._text_nrecords = nrecords

    def _rescale_image(self, data, maxsizeb, dimen=None):
@@ -304,8 +334,8 @@ class MobiWriter(object):
        metadata = self._oeb.metadata
        exth = self._build_exth()
        record0 = StringIO()
-        record0.write(pack('>HHIHHHH', self._compress, 0, self._text_length,
-            self._text_nrecords, RECORD_SIZE, 0, 0))
+        record0.write(pack('>HHIHHHH', self._compression, 0,
+            self._text_length, self._text_nrecords, RECORD_SIZE, 0, 0))
        uid = random.randint(0, 0xffffffff)
        title = str(metadata.title[0])
        record0.write('MOBI')
@@ -320,7 +350,11 @@ class MobiWriter(object):
        record0.write(pack('>I', 0x50))
        record0.write('\0' * 32)
        record0.write(pack('>IIII', 0xffffffff, 0xffffffff, 0, 0))
-        # TODO: What the hell are these fields?
+        # The '5' is a bitmask of extra record data at the end:
+        #   - 0x1: <extra multibyte bytes><size> (?)
+        #   - 0x4: <uncrossable breaks><size>
+        # Of course, the formats aren't quite the same.
+        # TODO: What the hell are the rest of these fields?
        record0.write(pack('>IIIIIIIIIIIIIIIII',
            0, 0, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff,
            0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 5, 0xffffffff))
@@ -391,9 +425,10 @@ class MobiWriter(object):
 def main(argv=sys.argv):
    from calibre.ebooks.oeb.base import DirWriter
    inpath, outpath = argv[1:]
-    context = Context('MSReader', 'Cybook3')
+    context = Context('MSReader', 'MobiDesktop')
    oeb = OEBBook(inpath)
-    writer = MobiWriter(compress=PALMDOC)
+    #writer = MobiWriter(compression=PALMDOC)
+    writer = MobiWriter(compression=UNCOMPRESSED)
    #writer = DirWriter()
    fbase = context.dest.fbase
    fkey = context.dest.fnums.values()
--- a/src/calibre/ebooks/oeb/profile.py
+++ b/src/calibre/ebooks/oeb/profile.py
@@ -39,6 +39,11 @@ PROFILES = {
        Profile(width=480, height=652, dpi=100.0, fbase=13,
                fsizes=[10, 11, 13, 16, 18, 20, 22, 26]),

+    # Not really, but let's pretend
+    'MobiDesktop':
+        Profile(width=340, height=400, dpi=100, fbase=12,
+                fsizes=[9, 10, 11, 12, 14, 17, 20, 24]),
+    
    # No clue on usable screen size and DPI
    'Cybook3':
        Profile(width=584, height=754, dpi=168.451, fbase=12,
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -170,6 +170,9 @@ class CSSFlattener(object):
                left -= style['text-indent']
            if self.unfloat and 'float' in cssdict and tag != 'img':
                del cssdict['float']
+            if 'vertical-align' in cssdict:
+                if cssdict['vertical-align'] == 'sup':
+                    cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh