Mobi debug: Figured out the TAGX table, use it to properly decode the index entries

2025-07-09 03:04:10 -04:00 · 2011-07-18 22:38:04 -06:00 · 2011-07-18 22:38:04 -06:00 · 55987fa6cb
commit 55987fa6cb
parent 08dff7d722
1 changed files with 105 additions and 84 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -377,18 +377,17 @@ class TagX(object): # {{{
    def __init__(self, raw, control_byte_count):
        self.tag = ord(raw[0])
        self.num_values = ord(raw[1])
-        self.bmask = ord(raw[2])
+        self.bitmask = ord(raw[2])
        self.bitmask = bin(self.bmask)
        # End of file = 1 iff last entry
        # When it is 1 all others are 0
        self.eof = ord(raw[3])
        self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
-                and self.bmask == 0)
+                and self.bitmask == 0)
    def __repr__(self):
-        return 'TAGX(tag=%02d, num_values=%d, bitmask=%r (%d), eof=%d)' % (self.tag,
+        return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag,
-                self.num_values, self.bitmask, self.bmask, self.eof)
+                self.num_values, bin(self.bitmask), self.eof)
    # }}}
 class IndexHeader(object): # {{{
@ -444,6 +443,7 @@ class IndexHeader(object): # {{{
                self.tagx_control_byte_count))
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')
        self.tagx_entries = self.tagx_entries[:-1]
        idxt0_pos = self.header_length+self.tagx_header_length
        last_num, consumed = decode_hex_number(raw[idxt0_pos:])
@ -497,6 +497,81 @@ class IndexHeader(object): # {{{
        return '\n'.join(ans)
    # }}}
 class Tag(object): # {{{
    '''
    Index entries are a collection of tags. Each tag is represented by this
    class.
    '''
    TAG_MAP = {
            1: ('offset', 'Offset in HTML'),
            2: ('size', 'Size in HTML'),
            3: ('label_offset', 'Offset to label in CNCX'),
            4: ('depth', 'Depth of this entry in TOC'),
            # The remaining tag types have to be interpreted subject to the type
            # of index entry they are present in
    }
    INTERPRET_MAP = {
            'subchapter': {
                    5  : ('Parent chapter index', 'parent_index')
            },
            'article'   : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    21 : ('Parent section index', 'parent_index'),
                    22 : ('Description offset in CTOC', 'desc_offset'),
                    23 : ('Author offset in CTOC', 'author_offset'),
            },
            'chapter_with_subchapters' : {
                    22 : ('First subchapter index', 'first_subchapter_index'),
                    23 : ('Last subchapter index', 'last_subchapter_index'),
            },
            'periodical' : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    22 : ('First section index', 'first_section_index'),
                    23 : ('Last section index', 'last_section_index'),
            },
            'section' : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    21 : ('Periodical index', 'periodical_index'),
                    22 : ('First article index', 'first_article_index'),
                    23 : ('Last article index', 'last_article_index'),
            },
    }
    def __init__(self, tagx, vals, entry_type, ctoc):
        self.value = vals if len(vals) > 1 else vals[0]
        self.entry_type = entry_type
        self.ctoc_value = None
        if tagx.tag in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tagx.tag]
        else:
            try:
                td = self.INTERPRET_MAP[entry_type]
            except:
                raise ValueError('Unknown entry type: %s'%entry_type)
            try:
                self.desc, self.attr = td[tagx.tag]
            except:
                raise ValueError('Unknown tag: %d for entry type: %s'%(
                    tagx.tag, entry_type))
        if '_offset' in self.attr:
            self.ctoc_value = ctoc[self.value]
    def __str__(self):
        if self.ctoc_value is not None:
            return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value)
        return '%s : %r'%(self.desc, self.value)
 # }}}
 class IndexEntry(object): # {{{
    TYPES = {
@ -510,97 +585,41 @@ class IndexEntry(object): # {{{
            0x3f : 'article',
    }
-    def __init__(self, ident, entry_type, raw, is_last):
+    def __init__(self, ident, entry_type, raw, ctoc, tagx_entries):
        self.index = ident
        self.fields = []
        self.sub_type = None
        self.raw = raw
        self.tags = []
        try:
            self.entry_type = self.TYPES[entry_type]
        except KeyError:
            raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))
-        if self.entry_type in (0xdf, 0xff):
+        expected_tags = [tag for tag in tagx_entries if tag.bitmask &
-            self.subtype = ord(raw[0])
+                entry_type]
            raw = raw[1:]
        while raw:
            val, consumed = decint(raw)
            raw = raw[consumed:]
            self.fields.append(val)
        if is_last and self.fields[-1] == 0:
            self.fields = self.fields[:-1]
-        self.interpret()
+        for tag in expected_tags:
-
+            vals = []
-    def interpret(self):
+            for i in range(tag.num_values):
-        self.offset = self.fields[0]
+                if not raw:
-        self.object_size = self.fields[1]
+                    raise ValueError('Index entry does not match TAGX header')
-        self.label_offset = self.fields[2]
+                val, consumed = decint(raw)
-        self.depth = self.fields[3]
+                raw = raw[consumed:]
-        self.extra = OrderedDict()
+                vals.append(val)
-        self.extra_fields = []
+            self.tags.append(Tag(tag, vals, self.entry_type, ctoc))
        if self.entry_type == 'subchapter':
            self.parent_index = self.fields[4]
            self.extra['Parent chapter index'] = 'parent_index'
            self.extra_fields = self.fields[5:]
        elif self.entry_type == 'article':
            self.class_offset = self.fields[4]
            self.extra['Class offset in CTOC'] = 'class_offset'
            self.parent_index = self.fields[5]
            self.extra['Parent section index'] = 'parent_index'
            if len(self.fields) > 6:
                self.desc_offset = self.fields[6]
                self.extra['Decription offset in CTOC'] = 'desc_offset'
            if len(self.fields) > 7:
                self.author_offset = self.fields[7]
                self.extra['Author offset in CTOC'] = 'author_offset'
            self.extra_fields = self.fields[8:]
        elif self.entry_type == 'chapter_with_subchapters':
            self.first_subchapter_index = self.fields[4]
            self.last_subchapter_index = self.fields[5]
            self.extra['First subchapter index'] = 'first_subchapter_index'
            self.extra['Last subchapter index'] = 'last_subchapter_index'
            self.extra_fields = self.fields[6:]
        elif self.entry_type == 'periodical':
            self.class_offset = self.fields[4]
            self.extra['Class offset in CTOC'] = 'class_offset'
            self.first_section_index = self.fields[5]
            self.last_section_index = self.fields[6]
            self.extra['First section index'] = 'first_section_index'
            self.extra['Last section index'] = 'last_section_index'
            self.extra_fields = self.fields[7:]
        elif self.entry_type == 'section':
            self.class_offset = self.fields[4]
            self.extra['Class offset in CTOC'] = 'class_offset'
            self.periodical_index = self.fields[5]
            self.extra['Periodical index'] = 'periodical_index'
            self.first_article_index = self.fields[6]
            self.last_article_index = self.fields[7]
            self.extra['First article index'] = 'first_article_index'
            self.extra['Last article index'] = 'last_article_index'
            self.extra_fields = self.fields[8:]
    def __str__(self):
-        ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%(
+        ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
-            self.index, self.entry_type, self.sub_type, len(self.raw))]
+            self.index, self.entry_type, len(self.tags))]
-        ans.append('\tOffset in HTML: %d'%self.offset)
+        for tag in self.tags:
-        ans.append('\tObject size in HTML: %d'%self.object_size)
+            ans.append('\t'+str(tag))
        ans.append('\tLabel offset in CTOC: %d'%self.label_offset)
        ans.append('\tDepth: %d'%self.depth)
        for text, attr in self.extra.iteritems():
            ans.append('\t%s: %d'%(text, getattr(self, attr)))
        if self.extra_fields:
            ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields),
                self.extra_fields))
        return '\n'.join(ans)
 # }}}
 class IndexRecord(object): # {{{
-    def __init__(self, record):
+    def __init__(self, record, index_header, ctoc):
        self.record = record
        raw = self.record.raw
        if raw[:4] != b'INDX':
@ -632,14 +651,12 @@ class IndexRecord(object): # {{{
        for i, off in enumerate(self.index_offsets):
            try:
                next_off = self.index_offsets[i+1]
                is_last = False
            except:
                next_off = len(indxt)
                is_last = True
            index, consumed = decode_hex_number(indxt[off:])
-            entry_type, = u(b'>B', indxt[off+consumed])
+            entry_type = ord(indxt[off+consumed])
            self.indices.append(IndexEntry(index, entry_type,
-                indxt[off+consumed+1:next_off], is_last))
+                indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries))
    def __str__(self):
@ -679,6 +696,9 @@ class CTOC(object) : # {{{
                        codec)
                pos += consumed+length
    def __getitem__(self, offset):
        return self.records.get(offset)
    def __str__(self):
        ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20]
        for k, v in self.records.iteritems():
@ -723,7 +743,8 @@ class MOBIFile(object): # {{{
            self.ctoc = CTOC(self.records[
                pir+2:pir+2+self.index_header.num_of_ctoc_blocks],
                self.index_header.index_encoding)
-            self.index_record = IndexRecord(self.records[pir+1])
+            self.index_record = IndexRecord(self.records[pir+1],
                    self.index_header, self.ctoc)
    def print_header(self, f=sys.stdout):