Refactor inspect MOBI to use the INDX reading code from mobi.reader

2025-07-09 03:04:10 -04:00 · 2012-03-17 15:31:05 +05:30 · 2012-03-17 15:31:05 +05:30 · c87ad6d69f
commit c87ad6d69f
parent 91a4bd7d42
5 changed files with 193 additions and 337 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -15,6 +15,8 @@ from lxml import html
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import (parse_index_record,
        parse_tagx_section)
 from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
        get_trailing_data, decode_tbs, read_font_record)
 from calibre.utils.magick.draw import identify_data
@@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{
 class TagX(object): # {{{
-    def __init__(self, raw):
+    def __init__(self, tag, num_values, bitmask, eof):
-        self.tag = ord(raw[0])
+        self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values,
-        self.num_values = ord(raw[1])
+                bitmask, eof)
-        self.bitmask = ord(raw[2])
+        self.num_of_values = num_values
        # End of file = 1 iff last entry
        # When it is 1 all others are 0
        self.eof = ord(raw[3])
        self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
                and self.bitmask == 0)
@@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
-        tag_table = tagx[12:self.tagx_header_length]
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if len(tag_table) % 4 != 0:
            raise ValueError('Invalid Tag table')
        num_tagx_entries = len(tag_table) // 4
        self.tagx_entries = []
        for i in range(num_tagx_entries):
            self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')
@@ -533,7 +525,8 @@ class IndexHeader(object): # {{{
            raise ValueError('Invalid Primary Index Record')
        self.header_length, = struct.unpack('>I', raw[4:8])
-        self.unknown1 = raw[8:16]
+        self.unknown1 = raw[8:12]
        self.header_type, = struct.unpack('>I', raw[12:16])
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
@@ -562,13 +555,7 @@ class IndexHeader(object): # {{{
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
-        tag_table = tagx[12:self.tagx_header_length]
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if len(tag_table) % 4 != 0:
            raise ValueError('Invalid Tag table')
        num_tagx_entries = len(tag_table) // 4
        self.tagx_entries = []
        for i in range(num_tagx_entries):
            self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')
@@ -602,6 +589,7 @@ class IndexHeader(object): # {{{
        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Header type: %d'%self.header_type)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
@@ -661,19 +649,15 @@ class Tag(object): # {{{
    }
-    def __init__(self, tagx, vals, entry_type, cncx):
+    def __init__(self, tag_type, vals, cncx):
        self.value = vals if len(vals) > 1 else vals[0] if vals else None
        self.entry_type = entry_type
        tag_type = tagx.tag
        self.cncx_value = None
        if tag_type in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tag_type]
        else:
-            print ('Unknown tag value: %d in entry type: %s'%(tag_type,
+            print ('Unknown tag value: %%s'%tag_type)
-                entry_type))
+            self.desc = '??Unknown (tag value: %d)'%tag_type
            self.desc = '??Unknown (tag value: %d type: %s)'%(
                    tag_type, entry_type)
            self.attr = 'unknown'
        if '_offset' in self.attr:
@@ -695,50 +679,13 @@ class IndexEntry(object): # {{{
    used in the navigation UI.
    '''
-    def __init__(self, ident, entry_type, raw, cncx, tagx_entries,
+    def __init__(self, ident, entry, cncx):
-            control_byte_count):
+        try:
            self.index = int(ident, 16)
        except ValueError:
            self.index = ident
-        self.raw = raw
+        self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in
-        self.tags = []
+                entry.iteritems()]
        self.entry_type = entry_type
        self.byte_size = len(raw)
        orig_raw = raw
        if control_byte_count not in (1, 2):
            raise ValueError('Unknown control byte count: %d'%
                    control_byte_count)
        self.flags = 0
        if control_byte_count == 2:
            self.flags = ord(raw[0])
            raw = raw[1:]
        expected_tags = [tag for tag in tagx_entries if tag.bitmask &
                entry_type]
        flags = self.flags
        for tag in expected_tags:
            vals = []
            if tag.tag > 0b1000000: # 0b1000000 = 64
                has_tag = flags & 0b1
                flags = flags >> 1
                if not has_tag: continue
            for i in range(tag.num_values):
                if not raw:
                    raise ValueError('Index entry does not match TAGX header')
                val, consumed = decint(raw)
                raw = raw[consumed:]
                vals.append(val)
            self.tags.append(Tag(tag, vals, self.entry_type, cncx))
        self.consumed = len(orig_raw) - len(raw)
        self.trailing_bytes = raw
        if self.trailing_bytes.replace(b'\0', b''):
            raise ValueError('%s has leftover bytes: %s'%(self, format_bytes(
                self.trailing_bytes)))
    @property
    def label(self):
@@ -797,102 +744,14 @@ class IndexEntry(object): # {{{
        return [0, 0]
    def __str__(self):
-        ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, '
+        ans = ['Index Entry(index=%s, length=%d)'%(
-                'length=%d, byte_size=%d)'%(
+            self.index, len(self.tags))]
            self.index, bin(self.entry_type), bin(self.flags)[2:],
            len(self.tags), self.byte_size)]
        for tag in self.tags:
            if tag.value is not None:
                ans.append('\t'+str(tag))
        if self.first_child_index != -1:
            ans.append('\tNumber of children: %d'%(self.last_child_index -
                self.first_child_index + 1))
        if self.trailing_bytes:
            ans.append('\tTrailing bytes: %r'%self.trailing_bytes)
        return '\n'.join(ans)
 # }}}
 class SecondaryIndexRecord(object): # {{{
    def __init__(self, record, index_header, cncx):
        self.record = record
        raw = self.record.raw
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Primary Index Record')
        u = struct.unpack
        self.header_length, = u('>I', raw[4:8])
        self.unknown1 = raw[8:12]
        self.header_type, = u('>I', raw[12:16])
        self.unknown2 = raw[16:20]
        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
        if self.idxt_offset < 192:
            raise ValueError('Unknown Index record structure')
        self.unknown3 = raw[28:36]
        self.unknown4 = raw[36:192] # Should be 156 bytes
        self.index_offsets = []
        indices = raw[self.idxt_offset:]
        if indices[:4] != b'IDXT':
            raise ValueError("Invalid IDXT index table")
        indices = indices[4:]
        for i in range(self.idxt_count):
            off, = u(b'>H', indices[i*2:(i+1)*2])
            self.index_offsets.append(off-192)
        rest = indices[(i+1)*2:]
        if rest.replace(b'\0', ''): # There can be padding null bytes
            raise ValueError('Extra bytes after IDXT table: %r'%rest)
        indxt = raw[192:self.idxt_offset]
        self.size_of_indxt_block = len(indxt)
        self.indices = []
        for i, off in enumerate(self.index_offsets):
            try:
                next_off = self.index_offsets[i+1]
            except:
                next_off = len(indxt)
            num = ord(indxt[off])
            index = indxt[off+1:off+1+num]
            consumed = 1 + num
            entry_type = ord(indxt[off+consumed])
            pos = off+consumed+1
            idxe = IndexEntry(index, entry_type,
                    indxt[pos:next_off], cncx,
                    index_header.tagx_entries,
                    index_header.tagx_control_byte_count)
            self.indices.append(idxe)
        rest = indxt[pos+self.indices[-1].consumed:]
        if rest.replace(b'\0', b''): # There can be padding null bytes
            raise ValueError('Extra bytes after IDXT table: %r'%rest)
    def __str__(self):
        ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))
        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
        u(self.unknown2)
        a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
            self.idxt_offset))
        a('IDXT Count: %d'%self.idxt_count)
        u(self.unknown3)
        u(self.unknown4)
        a('Index offsets: %r'%self.index_offsets)
        a('\nIndex Entries (%d entries):'%len(self.indices))
        for entry in self.indices:
            a(str(entry))
            a('')
        return '\n'.join(ans)
 # }}}
@@ -904,58 +763,25 @@ class IndexRecord(object): # {{{
    in the trailing data of the text records.
    '''
-    def __init__(self, record, index_header, cncx):
+    def __init__(self, records, index_header, cncx):
        self.record = record
        self.alltext = None
-        raw = self.record.raw
+        table = OrderedDict()
        tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
                index_header.tagx_entries]
        for record in records:
            raw = record.raw
            if raw[:4] != b'INDX':
                raise ValueError('Invalid Primary Index Record')
-        u = struct.unpack
+            parse_index_record(table, record.raw,
                    index_header.tagx_control_byte_count, tags,
                    index_header.index_encoding, strict=True)
        self.header_length, = u('>I', raw[4:8])
        self.unknown1 = raw[8:12]
        self.header_type, = u('>I', raw[12:16])
        self.unknown2 = raw[16:20]
        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
        if self.idxt_offset < 192:
            raise ValueError('Unknown Index record structure')
        self.unknown3 = raw[28:36]
        self.unknown4 = raw[36:192] # Should be 156 bytes
        self.index_offsets = []
        indices = raw[self.idxt_offset:]
        if indices[:4] != b'IDXT':
            raise ValueError("Invalid IDXT index table")
        indices = indices[4:]
        for i in range(self.idxt_count):
            off, = u(b'>H', indices[i*2:(i+1)*2])
            self.index_offsets.append(off-192)
        rest = indices[(i+1)*2:]
        if rest.replace(b'\0', ''): # There can be padding null bytes
            raise ValueError('Extra bytes after IDXT table: %r'%rest)
        indxt = raw[192:self.idxt_offset]
        self.size_of_indxt_block = len(indxt)
        self.indices = []
        for i, off in enumerate(self.index_offsets):
            try:
                next_off = self.index_offsets[i+1]
            except:
                next_off = len(indxt)
            index, consumed = decode_hex_number(indxt[off:])
            entry_type = ord(indxt[off+consumed])
            pos = off+consumed+1
            idxe = IndexEntry(index, entry_type,
                    indxt[pos:next_off], cncx,
                    index_header.tagx_entries,
                    index_header.tagx_control_byte_count)
            self.indices.append(idxe)
-        rest = indxt[pos+self.indices[-1].consumed:]
+        for ident, entry in table.iteritems():
-        if rest.replace(b'\0', b''): # There can be padding null bytes
+            self.indices.append(IndexEntry(ident, entry, cncx))
            raise ValueError('Extra bytes after IDXT table: %r'%rest)
    def get_parent(self, index):
        if index.depth < 1:
@@ -965,24 +791,12 @@ class IndexRecord(object): # {{{
            if p.depth != parent_depth:
                continue
    def __str__(self):
-        ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
+        ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))
        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
        u(self.unknown2)
        a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
            self.idxt_offset))
        a('IDXT Count: %d'%self.idxt_count)
        u(self.unknown3)
        u(self.unknown4)
        a('Index offsets: %r'%self.index_offsets)
        a('\nIndex Entries (%d entries):'%len(self.indices))
        for entry in self.indices:
            offset = entry.offset
            a(str(entry))
@@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{
    def get_index(self, idx):
        for i in self.indices:
-            if i.index == idx: return i
+            if i.index in {idx, unicode(idx)}: return i
        raise IndexError('Index %d not found'%idx)
    def __str__(self):
@@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{
            if entries:
                ans.append('\t%s:'%typ)
                for x in entries:
-                    ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
+                    ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
                            'Depth: %d, Offset: %d, Size: %d) [%s]')%(
                        x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
        def bin4(num):
@@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{
                        ' when reading starting section'%extra)
            si = self.get_index(si)
            ans.append('The section at the start of this record is:'
-                    ' %d'%si.index)
+                    ' %s'%si.index)
            if 0b0100 in extra:
                num = extra[0b0100]
                ans.append('The number of articles from the section %d'
-                        ' in this record: %d'%(si.index, num))
+                        ' in this record: %s'%(si.index, num))
            elif 0b0001 in extra:
                eof = extra[0b0001]
                if eof != 0:
                    raise ValueError('Unknown eof value %s when reading'
                            ' starting section. All bytes: %r'%(eof, orig))
                ans.append('??This record has more than one article from '
-                        ' the section: %d'%si.index)
+                        ' the section: %s'%si.index)
            return si, byts
        # }}}
@@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{
        pir = self.mobi_header.primary_index_record
        if pir != NULL_INDEX:
            self.index_header = IndexHeader(self.records[pir])
            numi = self.index_header.index_count
            self.cncx = CNCX(self.records[
-                pir+2:pir+2+self.index_header.num_of_cncx_blocks],
+                pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
                self.index_header.index_encoding)
-            self.index_record = IndexRecord(self.records[pir+1],
+            self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
                    self.index_header, self.cncx)
            self.indexing_record_nums = set(xrange(pir,
-                pir+2+self.index_header.num_of_cncx_blocks))
+                pir+1+numi+self.index_header.num_of_cncx_blocks))
        self.secondary_index_record = self.secondary_index_header = None
        sir = self.mobi_header.secondary_index_record
        if sir != NULL_INDEX:
            self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
            numi = self.secondary_index_header.index_count
            self.indexing_record_nums.add(sir)
-            self.secondary_index_record = SecondaryIndexRecord(
+            self.secondary_index_record = IndexRecord(
-                    self.records[sir+1], self.secondary_index_header, self.cncx)
+                    self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
-            self.indexing_record_nums.add(sir+1)
+            self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))
        ntr = self.mobi_header.number_of_text_records
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import struct
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple
-from calibre.ebooks.mobi.utils import decint, count_set_bits
+from calibre.ebooks.mobi.utils import (decint, count_set_bits,
        decode_string)
 TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
 PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
 class InvalidFile(ValueError):
    pass
@@ -37,9 +41,8 @@ def parse_indx_header(data):
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    )
    num = len(words)
-    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
+    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
-    header = {words[i]:values[i] for i in xrange(num)}
+    return dict(zip(words, values))
    return header
 class CNCX(object): # {{{
@@ -77,101 +80,94 @@ class CNCX(object): # {{{
        return self.records.get(offset, default)
 # }}}
-def parse_tag_section(data):
+def parse_tagx_section(data):
    check_signature(data, b'TAGX')
    tags = []
-    first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
+    first_entry_offset, = struct.unpack_from(b'>L', data, 4)
-    control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
+    control_byte_count, = struct.unpack_from(b'>L', data, 8)
    # Skip the first 12 bytes already read above.
    for i in xrange(12, first_entry_offset, 4):
-        pos = i
+        vals = list(bytearray(data[i:i+4]))
-        tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
+        tags.append(TagX(*vals))
            ord(data[pos+3])))
    return control_byte_count, tags
-def get_tag_map(control_byte_count, tags, data, start, end):
+def get_tag_map(control_byte_count, tagx, data, strict=False):
    ptags = []
    ans = {}
-    control_byte_index = 0
+    control_bytes = list(bytearray(data[:control_byte_count]))
-    data_start = start + control_byte_count
+    data = data[control_byte_count:]
-    for tag, values_per_entry, mask, end_flag in tags:
+    for x in tagx:
-        if end_flag == 0x01:
+        if x.eof == 0x01:
-            control_byte_index += 1
+            control_bytes = control_bytes[1:]
            continue
-        value = ord(data[start + control_byte_index]) & mask
+        value = control_bytes[0] & x.bitmask
        if value != 0:
-            if value == mask:
+            value_count = value_bytes = None
-                if count_set_bits(mask) > 1:
+            if value == x.bitmask:
                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
-                    value, consumed = decint(data[data_start:])
+                    value_bytes, consumed = decint(data)
-                    data_start += consumed
+                    data = data[consumed:]
                    ptags.append((tag, None, value, values_per_entry))
                else:
-                    ptags.append((tag, 1, None, values_per_entry))
+                    value_count = 1
            else:
                # Shift bits to get the masked value.
-                while mask & 0x01 == 0:
+                mask = x.bitmask
-                    mask = mask >> 1
+                while mask & 0b1 == 0:
-                    value = value >> 1
+                    mask >>= 1
-                ptags.append((tag, value, None, values_per_entry))
+                    value >>= 1
-    for tag, value_count, value_bytes, values_per_entry in ptags:
+                value_count = value
            ptags.append(PTagX(x.tag, value_count, value_bytes,
                x.num_of_values))
    for x in ptags:
        values = []
-        if value_count != None:
+        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
-            for _ in xrange(value_count*values_per_entry):
+            for _ in xrange(x.value_count * x.num_of_values):
-                byts, consumed = decint(data[data_start:])
+                byts, consumed = decint(data)
-                data_start += consumed
+                data = data[consumed:]
                values.append(byts)
-        else:
+        else: # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
-            while total_consumed < value_bytes:
+            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
-                byts, consumed = decint(data[data_start:])
+                byts, consumed = decint(data)
-                data_start += consumed
+                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
-            if total_consumed != value_bytes:
+            if total_consumed != x.value_bytes:
-                print ("Error: Should consume %s bytes, but consumed %s" %
+                err = ("Error: Should consume %s bytes, but consumed %s" %
-                        (value_bytes, total_consumed))
+                        (x.value_bytes, total_consumed))
-        ans[tag] = values
+                if strict:
-    # Test that all bytes have been processed if end is given.
+                    raise ValueError(err)
-    if end is not None and data_start < end:
+                else:
-        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
+                    print(err)
-        rest = data[data_start:end]
+        ans[x.tag] = values
-        if rest.replace(b'\0', b''):
+    # Test that all bytes have been processed
-            print ("Warning: There are unprocessed index bytes left: %s" %
+    if data.replace(b'\0', b''):
-                    format_bytes(rest))
+        err = ("Warning: There are unprocessed index bytes left: %s" %
                format_bytes(data))
        if strict:
            raise ValueError(err)
        else:
            print(err)
    return ans
-def read_index(sections, idx, codec):
+def parse_index_record(table, data, control_byte_count, tags, codec,
-    table, cncx = OrderedDict(), CNCX([], codec)
+        strict=False):
    data = sections[idx][0]
    indx_header = parse_indx_header(data)
    indx_count = indx_header['count']
    if indx_header['ncncx'] > 0:
        off = idx + indx_count + 1
        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)
    tag_section_start = indx_header['len']
    control_byte_count, tags = parse_tag_section(data[tag_section_start:])
    for i in xrange(idx + 1, idx + 1 + indx_count):
        data = sections[i][0]
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print ('WARNING: Invalid INDX record')
    entry_count = header['count']
    # loop through to build up the IDXT position starts
@@ -187,11 +183,32 @@ def read_index(sections, idx, codec):
    # text
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
-            text_length = ord(data[start])
+        rec = data[start:end]
-            text = data[start+1:start+1+text_length]
+        ident, consumed = decode_string(rec, codec=codec)
-            tag_map = get_tag_map(control_byte_count, tags, data,
+        rec = rec[consumed:]
-                    start+1+text_length, end)
+        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
-            table[text] = tag_map
+        table[ident] = tag_map
 def read_index(sections, idx, codec):
    table, cncx = OrderedDict(), CNCX([], codec)
    data = sections[idx][0]
    indx_header = parse_indx_header(data)
    indx_count = indx_header['count']
    if indx_header['ncncx'] > 0:
        off = idx + indx_count + 1
        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)
    tag_section_start = indx_header['len']
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
    for i in xrange(idx + 1, idx + 1 + indx_count):
        # Index record
        data = sections[i][0]
        parse_index_record(table, data, control_byte_count, tags, codec)
    return table, cncx
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -317,6 +317,7 @@ class Mobi8Reader(object):
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
            #print (11111111, fi, entry['pos_fid'])
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
--- a/src/calibre/ebooks/mobi/reader/ncx.py
+++ b/src/calibre/ebooks/mobi/reader/ncx.py
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import os
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index
@@ -23,7 +22,30 @@ tag_fieldname_map = {
        6:  ['pos_fid',0],
        21: ['parent',0],
        22: ['child1',0],
-        23: ['childn',0]
+        23: ['childn',0],
        69: ['image_index',0],
        70 : ['desc_offset', 0], # 'Description offset in cncx'
        71 : ['author_offset', 0], # 'Author offset in cncx'
        72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
        73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
 }
 default_entry = {
                    'pos':  -1,
                    'len':  0,
                    'noffs': -1,
                    'text' : "Unknown Text",
                    'hlvl' : -1,
                    'kind' : "Unknown Class",
                    'pos_fid' : None,
                    'parent' : -1,
                    'child1' : -1,
                    'childn' : -1,
                    'description': None,
                    'author': None,
                    'image_caption': None,
                    'image_attribution': None,
 }
 def read_ncx(sections, index, codec):
@@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):
        for num, x in enumerate(table.iteritems()):
            text, tag_map = x
-            entry = {
+            entry = default_entry.copy()
-                    'name': text,
+            entry['name'] = text
-                    'pos':  -1,
+            entry['num'] = num
                    'len':  0,
                    'noffs': -1,
                    'text' : "Unknown Text",
                    'hlvl' : -1,
                    'kind' : "Unknown Kind",
                    'pos_fid' : None,
                    'parent' : -1,
                    'child1' : -1,
                    'childn' : -1,
                    'num'  : num
            }
-            for tag in tag_fieldname_map.keys():
+            for tag in tag_fieldname_map.iterkeys():
                fieldname, i = tag_fieldname_map[tag]
                if tag in tag_map:
                    fieldvalue = tag_map[tag][i]
                    if tag == 6:
-                        fieldvalue = to_base(fieldvalue, base=32)
+                        # Appears to be an idx into the KF8 elems table with an
                        # offset
                        fieldvalue = tuple(tag_map[tag])
                    entry[fieldname] = fieldvalue
-                    if tag == 3:
+                    for which, name in {3:'text', 5:'kind', 70:'description',
-                        entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
+                            71:'author', 72:'image_caption',
-                    if tag == 5:
+                            73:'image_attribution'}.iteritems():
-                        entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
+                        if tag == which:
                            entry[name] = cncx.get(fieldvalue,
                                    default_entry[name])
            index_entries.append(entry)
    return index_entries
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -15,7 +15,13 @@ from calibre.ebooks import normalize
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
-def decode_hex_number(raw):
+def decode_string(raw, codec='utf-8'):
    length, = struct.unpack(b'>B', raw[0])
    raw = raw[1:1+length]
    consumed = length+1
    return raw.decode(codec), consumed
 def decode_hex_number(raw, codec='utf-8'):
    '''
    Return a variable length number encoded using hexadecimal encoding. These
    numbers have the first byte which tells the number of bytes that follow.
@@ -25,13 +31,16 @@ def decode_hex_number(raw):
    :param raw: Raw binary data as a bytestring
    :return: The number and the number of bytes from raw that the number
-    occupies
+    occupies.
    '''
-    length, = struct.unpack(b'>B', raw[0])
+    raw, consumed = decode_string(raw, codec=codec)
    raw = raw[1:1+length]
    consumed = length+1
    return int(raw, 16), consumed
 def encode_string(raw):
    ans = bytearray(bytes(raw))
    ans.insert(0, len(ans))
    return bytes(ans)
 def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the
@@ -44,9 +53,7 @@ def encode_number_as_hex(num):
    nlen = len(num)
    if nlen % 2 != 0:
        num = b'0'+num
-    ans = bytearray(num)
+    return encode_string(num)
    ans.insert(0, len(num))
    return bytes(ans)
 def encint(value, forward=True):
    '''