KF8 Input: Add support for KF8 files with obfuscated embedded fonts

2025-08-07 09:01:38 -04:00 · 2012-03-14 12:32:13 +05:30 · 2012-03-14 12:32:13 +05:30 · a8d46f2f40
commit a8d46f2f40
parent cdfb1214e9
4 changed files with 135 additions and 56 deletions
--- a/src/calibre/ebooks/conversion/plugins/mobi_input.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py
@ -34,10 +34,12 @@ class MOBIInput(InputFormatPlugin):
                accelerators):

        if os.environ.get('USE_MOBIUNPACK', None) is not None:
+            pos = stream.tell()
            try:
                return run_mobi_unpack(stream, options, log, accelerators)
            except Exception:
                log.exception('mobi_unpack code not working')
+            stream.seek(pos)

        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
        from lxml import html
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import struct, datetime, sys, os, shutil, zlib
+import struct, datetime, sys, os, shutil
 from collections import OrderedDict, defaultdict

 from lxml import html
@ -15,7 +15,7 @@ from lxml import html
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
 from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
-        get_trailing_data, decode_tbs)
+        get_trailing_data, decode_tbs, read_font_record)
 from calibre.utils.magick.draw import identify_data

 def format_bytes(byts):
@ -1154,26 +1154,13 @@ class FontRecord(object): # {{{
    def __init__(self, idx, record):
        self.raw = record.raw
        name = '%06d'%idx
-        (self.uncompressed_size, self.unknown1, self.unknown2) = \
-                struct.unpack_from(b'>LLL', self.raw, 4)
-        self.payload = self.raw[4:]
-        self.ext = 'unknown'
-        if self.unknown1 == 1:
-            self.zlib_header = self.raw[self.unknown2:self.unknown2+2]
-            self.payload = zlib.decompress(self.raw[self.unknown2+2:-4], -15)
-            hdr = self.payload[:4]
-            if hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
-                self.ext = 'ttf'
-            if self.uncompressed_size != len(self.payload):
-                raise ValueError('Font record uncompressed size mismatch',
-                        ' expected: %d actual: %d'%(self.uncompressed_size,
-                            len(self.payload)))
-        else:
-            print ('Unknown font record with fields: %s' %
-                    [self.uncompressed_size, self.unknown1, self.unknown2])
-            print ('\tAdditional fields: %s'%((
-                struct.unpack_from(b'>LL', self.raw, 16),)))
-        self.name = '%s.%s'%(name, self.ext)
+        self.font = read_font_record(self.raw)
+        if self.font['err']:
+            raise ValueError('Failed to read font record: %s Headers: %s'%(
+                self.font['err'], self.font['headers']))
+        self.payload = (self.font['font_data'] if self.font['font_data'] else
+                self.font['raw_data'])
+        self.name = '%s.%s'%(name, self.font['ext'])

    def dump(self, folder):
        with open(os.path.join(folder, self.name), 'wb') as f:
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import struct, re, os, zlib, imghdr
+import struct, re, os, imghdr
 from collections import namedtuple
 from itertools import repeat

@ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index
 from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
 from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
 from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
+from calibre.ebooks.mobi.utils import read_font_record

 Part = namedtuple('Part',
    'num type filename start end aid')
@ -339,39 +340,16 @@ class Mobi8Reader(object):
                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
                pass # Ignore these records
            elif typ == b'FONT':
-                # fonts only exist in K8 ebooks
-                # Format:
-                # bytes  0 -  3:  'FONT'
-                # bytes  4 -  7:  ?? Expanded size in bytes ??
-                # bytes  8 - 11:  ?? number of files ??
-                # bytes 12 - 15:  ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
-                # bytes 16 - 23:  ?? typically all 0x00 ??  Are these compression flags from zlib?
-                # The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
-                try:
-                    fields = struct.unpack_from(b'>LLLLL', data, 4)
-                except:
-                    fields = None
-                # self.log.debug('Font record fields: %s'%(fields,))
-                cdata = data[26:-4]
-                ext = 'dat'
-                try:
-                    uncompressed_data = zlib.decompress(cdata, -15)
-                except:
-                    self.log.warn('Failed to uncompress embedded font %d: '
-                            'Fields: %s' % (fname_idx, fields,))
-                    uncompressed_data = data[4:]
-                    ext = 'failed'
-                if len(uncompressed_data) < 200:
-                    self.log.warn('Failed to uncompress embedded font %d: '
-                            'Fields: %s' % (fname_idx, fields,))
-                    uncompressed_data = data[4:]
-                    ext = 'failed'
-                hdr = uncompressed_data[:4]
-                if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
-                    ext = 'ttf'
-                href = "fonts/%05d.%s" % (fname_idx, ext)
+                font = read_font_record(data)
+                href = "fonts/%05d.%s" % (fname_idx, font['ext'])
+                if font['err']:
+                    self.log.warn('Reading font record %d failed: %s'%(
+                        fname_idx, font['err']))
+                    if font['headers']:
+                        self.log.debug('Font record headers: %s'%font['headers'])
                with open(href.replace('/', os.sep), 'wb') as f:
-                    f.write(uncompressed_data)
+                    f.write(font['font_data'] if font['font_data'] else
+                            font['raw_data'])
            else:
                imgtype = imghdr.what(None, data)
                if imgtype is None:
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import struct, string, imghdr
+import struct, string, imghdr, zlib
 from collections import OrderedDict

 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@ -373,4 +373,116 @@ def mobify_image(data):
        data = im.export('gif')
    return data

+def read_zlib_header(header):
+    '''
+    Validate the two byte header (CMF and FLG) of a zlib stream. See
+    sec 2.2 of RFC 1950 for the zlib stream format
+    http://www.ietf.org/rfc/rfc1950.txt
+
+    :param header: The first two bytes of the zlib stream. Any extra bytes
+    are ignored.
+
+    :return: A tuple ``(wbits, err)``. On success wbits is the base two
+    logarithm of the window size (CINFO + 8) and err is None. On failure
+    wbits is None and err is a message describing the problem.
+    '''
+    header = bytearray(header)
+    if len(header) < 2:
+        return None, 'Bad zlib header, too short'
+    cmf_byte, flg_byte = header[0], header[1]
+    # CMF*256 + FLG must be a multiple of 31, the FCHECK bits of FLG are
+    # chosen to make it so
+    if (cmf_byte*256 + flg_byte)%31 != 0:
+        return None, 'Bad zlib header, FCHECK failed'

+    cm = cmf_byte & 0b1111 # compression method, low nibble of CMF
+    cinfo = cmf_byte >> 4 # window size information, high nibble of CMF
+    if cm != 8:
+        return None, 'Unknown zlib compression method: %d'%cm
+    if cinfo > 7:
+        return None, 'Invalid CINFO field in zlib header: %d'%cinfo
+    # FDICT is bit 5 of FLG (bits 0-4 are FCHECK), so the mask must be
+    # 0b100000 for the shift by 5 to yield the flag
+    fdict = (flg_byte & 0b100000) >> 5
+    if fdict != 0:
+        return None, 'FDICT based zlib compression not supported'
+    return cinfo + 8, None
+
+
+def read_font_record(data, extent=1040): # {{{
+    '''
+    Return the font encoded in the MOBI FONT record represented by data.
+    The return value is a dict with fields raw_data, font_data, err, ext,
+    headers.
+
+    :param data: The raw bytes of the FONT record, starting with the four
+    byte record type.
+
+    :param extent: The number of obfuscated bytes. So far I have only
+    encountered files with 1040 obfuscated bytes. If you encounter an
+    obfuscated record for which this function fails, try different extent
+    values (easily automated). None means the whole payload is obfuscated.
+
+    raw_data is the raw data in the font record
+    font_data is the decoded font_data or None if an error occurred
+    err is not None if some error occurred
+    ext is the font type (ttf for TrueType, dat for unknown and failed if an
+    error occurred)
+    headers is a dict of the decoded header fields from the font record or
+    None if decoding them failed
+    '''
+    # Format:
+    # bytes  0 -  3:  'FONT'
+    # bytes  4 -  7:  Uncompressed size
+    # bytes  8 - 11:  flags
+    #                   bit 1 - zlib compression
+    #                   bit 2 - XOR obfuscated
+    # bytes 12 - 15:  offset to start of compressed data
+    # bytes 16 - 19:  length of XOR string
+    # bytes 20 - 23:  offset to start of XOR data
+    # The zlib compressed data begins with 2 bytes of header and
+    # has 4 bytes of checksum at the end
+    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
+            'headers':None}

+    try:
+        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
+                b'>LLLLL', data, 4)
+    except (struct.error, TypeError):
+        ans['err'] = 'Failed to read font record header fields'
+        return ans
+    font_data = data[dstart:]
+    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
+            'xor_start':xor_start, 'dstart':dstart}

+    if flags & 0b10:
+        # De-obfuscate the data
+        key = bytearray(data[xor_start:xor_start+xor_len])
+        if not key or len(key) != xor_len:
+            # A zero length key would divide by zero below and a truncated
+            # key would index past its end
+            ans['err'] = 'Invalid XOR key in font record'
+            return ans
+        buf = bytearray(font_data)
+        extent = len(buf) if extent is None else min(extent, len(buf))

+        for n in range(extent):
+            buf[n] ^= key[n%xor_len] # XOR of buf and key

+        font_data = bytes(buf)

+    if flags & 0b1:
+        # ZLIB compressed data
+        if len(font_data) < 6:
+            # Need at least the 2 byte header and the 4 byte checksum
+            ans['err'] = 'Font record too short to contain zlib data'
+            return ans
+        wbits, err = read_zlib_header(font_data[:2])
+        if err is not None:
+            ans['err'] = err
+            return ans
+        try:
+            # remove two bytes of zlib header and 4 bytes of trailing checksum
+            # negative wbits means a raw deflate stream, with no zlib header
+            # or trailer
+            font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
+        except Exception as e:
+            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
+            return ans

+        if len(font_data) != usize:
+            ans['err'] = 'Uncompressed font size mismatch'
+            return ans

+        # The trailing Adler32 checksum is deliberately not verified, the
+        # stored value was observed to almost never match the computed one

+    ans['font_data'] = font_data
+    ans['ext'] = ('ttf' if font_data[:4] in {b'\0\1\0\0', b'true', b'ttcf'}
+                    else 'dat')

+    return ans
+# }}}
+
+
+