Mobi debug: Dump KF8 SKEL and SECT indices

2025-07-09 03:04:10 -04:00 · 2012-04-11 15:44:40 +05:30 · 2012-04-11 15:44:40 +05:30 · 94ff0c64d5
commit 94ff0c64d5
parent 5fd415ea2d
3 changed files with 146 additions and 0 deletions
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import OrderedDict, namedtuple
+
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
+        parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
+
+# One parsed entry of the SKEL index; SKELIndex fills it in from tags 1 and 6
+File = namedtuple('File', [
+    'file_number', 'name', 'divtbl_count', 'start_position', 'length'])
+
+# One parsed entry of the SECT index; SECTIndex fills it in from tags 2, 3, 4
+# and 6, with toc_text looked up in the CNCX
+Elem = namedtuple('Elem', [
+    'insert_pos', 'toc_text', 'file_number', 'sequence_number',
+    'start_pos', 'length'])
+
+def read_index(sections, idx, codec):
+    '''
+    Parse the index whose header record is ``sections[idx]``.
+
+    The header record is followed by ``count`` index records and then by
+    ``ncncx`` CNCX records (both numbers are read from the parsed header).
+
+    :param sections: Sequence of records, each exposing its bytes as ``.raw``
+    :param idx: Position in ``sections`` of the index header record
+    :param codec: Codec handed to :class:`CNCX` and to
+        :func:`parse_index_record`
+    :return: ``(table, cncx, indx_header)``: the OrderedDict filled in by
+        :func:`parse_index_record`, the CNCX object (built from no records
+        when the header reports none) and the parsed header
+    '''
+    entries = OrderedDict()
+    cncx = CNCX([], codec)
+
+    header_raw = sections[idx].raw
+    indx_header = parse_indx_header(header_raw)
+    num_records = indx_header['count']
+    first_record = idx + 1
+    # The CNCX records, if any, come right after the last index record
+    cncx_start = first_record + num_records
+
+    num_cncx = indx_header['ncncx']
+    if num_cncx > 0:
+        cncx = CNCX(
+            [rec.raw for rec in sections[cncx_start:cncx_start + num_cncx]],
+            codec)
+
+    control_byte_count, tags = parse_tagx_section(
+            header_raw[indx_header['tagx']:])
+
+    for pos in xrange(first_record, cncx_start):
+        # Every index record adds its entries to the shared table
+        parse_index_record(entries, sections[pos].raw, control_byte_count,
+                tags, codec, indx_header['ordt_map'], strict=True)
+
+    return entries, cncx, indx_header
+
+class Index(object):
+    '''
+    A parsed index together with a plain text rendering of it.
+
+    ``table``, ``cncx`` and ``header`` hold the result of :func:`read_index`
+    and stay ``None`` when ``idx`` is ``NULL_INDEX``. ``records`` is always
+    ``None`` here; it is rendered under "Parsed Entries" when non-empty.
+    '''
+
+    def __init__(self, idx, records, codec):
+        self.table = None
+        self.cncx = None
+        self.header = None
+        self.records = None
+        if idx == NULL_INDEX:
+            # There is no such index, leave everything unset
+            return
+        self.table, self.cncx, self.header = read_index(records, idx, codec)
+
+    def render(self):
+        '''
+        Return the dump as a list of lines: the header fields, the CNCX
+        entries (only if the CNCX is truthy), the raw index entries and
+        finally the parsed records, if any.
+        '''
+        stars = '*'*10
+        lines = [stars + ' Index Header ' + stars]
+        if self.header is not None:
+            lines.extend('%-12s: %r'%(name, self.header[name])
+                    for name in INDEX_HEADER_FIELDS)
+        lines += ['', '']
+
+        if self.cncx:
+            lines.append(stars + ' CNCX ' + stars)
+            lines.extend('%10s: %s'%(offset, text)
+                    for offset, text in self.cncx.iteritems())
+            lines += ['', '']
+
+        if self.table is not None:
+            lines.append(stars + ' %d Index Entries '%len(self.table) + stars)
+            lines.extend('%s: %r'%(key, tag_map)
+                    for key, tag_map in self.table.iteritems())
+
+        if self.records:
+            lines += ['', '', stars + ' Parsed Entries ' + stars]
+            lines.extend(repr(rec) for rec in self.records)
+
+        lines.append('')
+        return lines
+
+    def __str__(self):
+        '''The lines from :meth:`render` joined with newlines'''
+        rendered = self.render()
+        return '\n'.join(rendered)
+
+class SKELIndex(Index):
+    '''
+    The SKEL index, with every entry additionally parsed into a
+    :class:`File`, available in order as ``self.records``.
+
+    :raises ValueError: If an entry has a set of tags other than exactly
+        ``{1, 6}``
+    '''
+
+    def __init__(self, skelidx, records, codec):
+        super(SKELIndex, self).__init__(skelidx, records, codec)
+        self.records = []
+        if self.table is None:
+            # NULL_INDEX: nothing was read, so there is nothing to parse
+            return
+
+        expected = {1, 6}
+        for i, (text, tag_map) in enumerate(self.table.iteritems()):
+            found = set(tag_map.iterkeys())
+            if found != expected:
+                unknown = found - expected
+                if unknown:
+                    raise ValueError('SKEL Index has unknown tags: %s'%
+                            (unknown,))
+                # No unexpected tag, so a required one is absent. Say so,
+                # instead of reporting an empty set of unknown tags
+                raise ValueError('SKEL Index is missing tags: %s'%
+                        (expected - found,))
+            self.records.append(File(
+                i, # file_number
+                text, # name
+                tag_map[1][0], # divtbl_count
+                tag_map[6][0], # start_position
+                tag_map[6][1]) # length
+            )
+
+class SECTIndex(Index):
+    '''
+    The SECT index, with every entry additionally parsed into an
+    :class:`Elem`, available in order as ``self.records``.
+
+    :raises ValueError: If an entry has a set of tags other than exactly
+        ``{2, 3, 4, 6}``
+    '''
+
+    def __init__(self, sectidx, records, codec):
+        super(SECTIndex, self).__init__(sectidx, records, codec)
+        self.records = []
+        if self.table is None:
+            # NULL_INDEX: nothing was read, so there is nothing to parse
+            return
+
+        expected = {2, 3, 4, 6}
+        for text, tag_map in self.table.iteritems():
+            found = set(tag_map.iterkeys())
+            if found != expected:
+                unknown = found - expected
+                if unknown:
+                    raise ValueError('SECT Index has unknown tags: %s'%
+                            (unknown,))
+                # No unexpected tag, so a required one is absent. Say so,
+                # instead of reporting an empty set of unknown tags
+                raise ValueError('SECT Index is missing tags: %s'%
+                        (expected - found,))
+
+            # Tag 2 holds the key under which the text is stored in the CNCX
+            toc_text = self.cncx[tag_map[2][0]]
+            self.records.append(Elem(
+                int(text), # insert_pos
+                toc_text, # toc_text
+                tag_map[3][0], # file_number
+                tag_map[4][0], # sequence_number
+                tag_map[6][0], # start_pos
+                tag_map[6][1]  # length
+                )
+            )
+
+
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@ -11,6 +11,7 @@ import sys, os, imghdr, struct
 from itertools import izip

 from calibre.ebooks.mobi.debug.headers import TextRecord
+from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex)
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.mobi.debug import format_bytes
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@ -65,6 +66,7 @@ class MOBIFile(object):
        self.header = self.mf.mobi8_header
        self.extract_resources()
        self.read_fdst()
+        self.read_indices()

    def print_header(self, f=sys.stdout):
        print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -85,6 +87,12 @@ class MOBIFile(object):
            if self.fdst.num_sections != self.header.fdst_count:
                raise ValueError('KF8 Header contains invalid FDST count')

+    def read_indices(self):
+        '''
+        Parse the SKEL and SECT indices that ``self.header`` points to and
+        store them as ``self.skel_index`` and ``self.sect_index``.
+        '''
+        header, records = self.header, self.mf.records
+        self.skel_index = SKELIndex(header.skel_idx, records, header.encoding)
+        self.sect_index = SECTIndex(header.sect_idx, records, header.encoding)
+
    def extract_resources(self):
        self.resource_map = []
        known_types = {b'FLIS', b'FCIS', b'SRCS',
@ -145,3 +153,9 @@ def inspect_mobi(mobi_file, ddir):
        with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
            fo.write(str(f.fdst).encode('utf-8'))

+    with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
+        fo.write(str(f.skel_index).encode('utf-8'))
+
+    with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+        fo.write(str(f.sect_index).encode('utf-8'))
+
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@ -111,6 +111,12 @@ class CNCX(object): # {{{

    def get(self, offset, default=None):
+        '''Return ``self.records.get(offset, default)`` (presumably the text stored at ``offset``, or ``default`` if there is none; records is defined outside this hunk)'''
        return self.records.get(offset, default)
+
+    def __bool__(self):
+        '''True if ``self.records`` is non-empty'''
+        return bool(self.records)
+
+    # Python 2 calls __nonzero__, not __bool__, when truth testing an object.
+    # Alias it so that tests such as ``if cncx:`` actually reach the method
+    # above
+    __nonzero__ = __bool__
+
+    def iteritems(self):
+        '''Return whatever ``self.records.iteritems()`` returns'''
+        records = self.records
+        return records.iteritems()
 # }}}

 def parse_tagx_section(data):