KF8: Fully decode FDST records

2025-07-09 03:04:10 -04:00 · 2012-04-10 18:06:20 +05:30 · 2012-04-10 18:06:20 +05:30 · 681d33416b
commit 681d33416b
parent 5017ba10ca
3 changed files with 64 additions and 19 deletions
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -7,10 +7,41 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import sys, os, imghdr
+import sys, os, imghdr, struct
+from itertools import izip

 from calibre.ebooks.mobi.debug.headers import TextRecord
 from calibre.ebooks.mobi.utils import read_font_record
+from calibre.ebooks.mobi.debug import format_bytes
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+
+class FDST(object):
+    ''' Decoded FDST record: a table of (start, end) pairs, one per section '''
+
+    def __init__(self, raw):
+        if raw[:4] != b'FDST':
+            raise ValueError('KF8 does not have a valid FDST record')
+        off, count = struct.unpack_from(b'>LL', raw, 4)
+        self.sec_off, self.num_sections = off, count
+        if off != 12:
+            raise ValueError('FDST record has unknown extra fields')
+        fmt = b'>%dL' % (2 * count)
+        values = struct.unpack_from(fmt, raw, off)
+        trailing = raw[off + struct.calcsize(fmt):]
+        if trailing:
+            raise ValueError('FDST record has trailing data: '
+                    '%s'%format_bytes(trailing))
+        # The table is flat (start, end, start, end, ...), so pair it up
+        self.sections = tuple(izip(values[::2], values[1::2]))
+
+    def __str__(self):
+        lines = ['FDST record',
+                'Offset to sections: %s' % self.sec_off,
+                'Number of section records: %s' % self.num_sections,
+                '**** %d Sections ****'% len(self.sections)]
+        lines.extend('Start: %20d End: %d'%sec for sec in self.sections)
+        return '\n'.join(lines)
+

 class MOBIFile(object):

@@ -31,7 +62,10 @@ class MOBIFile(object):
                first_text_record+offset+h8.number_of_text_records])]

        self.raw_text = b''.join(r.raw for r in self.text_records)
+        self.header = self.mf.mobi8_header
+        self.kf8_records = mf.records[offset:]
        self.extract_resources()
+        self.read_fdst()

    def print_header(self, f=sys.stdout):
        print (str(self.mf.palmdb).encode('utf-8'), file=f)
@@ -43,6 +77,15 @@ class MOBIFile(object):
        print (file=f)
        print (str(self.mf.mobi8_header).encode('utf-8'), file=f)

+    def read_fdst(self):
+        ''' Parse the FDST record the KF8 header points to, if any '''
+        self.fdst = None
+        idx = self.header.fdst_idx
+        if idx != NULL_INDEX:
+            fdst = self.fdst = FDST(self.kf8_records[idx].raw)
+            if fdst.num_sections != self.header.fdst_count:
+                raise ValueError('KF8 Header contains invalid FDST count')
+
    def extract_resources(self):
        self.resource_map = []
        known_types = {b'FLIS', b'FCIS', b'SRCS',
@@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir):
        rec.dump(os.path.join(ddir, 'text_records'))

    for href, payload in f.resource_map:
-        with open(os.path.join(ddir, href), 'wb') as f:
-            f.write(payload)
+        with open(os.path.join(ddir, href), 'wb') as fo:
+            fo.write(payload)

+    if f.fdst:
+        with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
+            fo.write(str(f.fdst).encode('utf-8'))

--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,

 TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
 PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
+# Names of the consecutive 32-bit header fields that follow the INDX signature
+INDEX_HEADER_FIELDS = ('len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
+        'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx') + tuple(
+        'unknown%d'%i for i in xrange(27)) + (
+        'ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx')
+

 class InvalidFile(ValueError):
    pass
@@ -36,11 +42,7 @@ def format_bytes(byts):

 def parse_indx_header(data):
    check_signature(data, b'INDX')
-    words = (
-            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
-            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
-    ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
-            'ordt1', 'ordt2', 'tagx')
+    words = INDEX_HEADER_FIELDS
    num = len(words)
    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
    ans = dict(zip(words, values))
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'

 import struct, re, os, imghdr
 from collections import namedtuple
-from itertools import repeat
+from itertools import repeat, izip
 from urlparse import urldefrag

 from lxml import etree
@@ -71,16 +71,16 @@ class Mobi8Reader(object):
        return self.write_opf(guide, ncx, spine, resource_map)

    def read_indices(self):
-        self.flow_table = (0, NULL_INDEX)
+        self.flow_table = ()

        if self.header.fdstidx != NULL_INDEX:
            header = self.kf8_sections[self.header.fdstidx][0]
            if header[:4] != b'FDST':
                raise ValueError('KF8 does not have a valid FDST record')
-            num_sections, = struct.unpack_from(b'>L', header, 0x08)
-            sections = header[0x0c:]
-            self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
-                    sections, 0)[::2] + (NULL_INDEX,)
+            sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
+            secs = struct.unpack_from(b'>%dL' % (num_sections*2),
+                    header, sec_start)
+            self.flow_table = tuple(izip(secs[::2], secs[1::2]))

        self.files = []
        if self.header.skelidx != NULL_INDEX:
@@ -127,13 +127,10 @@ class Mobi8Reader(object):
        raw_ml = self.mobi6_reader.mobi_html
        self.flows = []
        self.flowinfo = []
+        ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]

        # now split the raw_ml into its flow pieces
-        for j in xrange(0, len(self.flow_table)-1):
-            start = self.flow_table[j]
-            end = self.flow_table[j+1]
-            if end == NULL_INDEX:
-                end = len(raw_ml)
+        for start, end in ft:
            self.flows.append(raw_ml[start:end])

        # the first piece represents the xhtml text