Refactor inspect MOBI to use the INDX reading code from mobi.reader

2025-07-09 03:04:10 -04:00 · 2012-03-17 15:31:05 +05:30 · 2012-03-17 15:31:05 +05:30 · c87ad6d69f
commit c87ad6d69f
parent 91a4bd7d42
5 changed files with 193 additions and 337 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -15,6 +15,8 @@ from lxml import html
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.reader.index import (parse_index_record,
+        parse_tagx_section)
 from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
        get_trailing_data, decode_tbs, read_font_record)
 from calibre.utils.magick.draw import identify_data
@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{

 class TagX(object): # {{{

-    def __init__(self, raw):
-        self.tag = ord(raw[0])
-        self.num_values = ord(raw[1])
-        self.bitmask = ord(raw[2])
-        # End of file = 1 iff last entry
-        # When it is 1 all others are 0
-        self.eof = ord(raw[3])
-
+    def __init__(self, tag, num_values, bitmask, eof):
+        self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values,
+                bitmask, eof)
+        self.num_of_values = num_values
        self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
                and self.bitmask == 0)

@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
-        tag_table = tagx[12:self.tagx_header_length]
-        if len(tag_table) % 4 != 0:
-            raise ValueError('Invalid Tag table')
-        num_tagx_entries = len(tag_table) // 4
-        self.tagx_entries = []
-        for i in range(num_tagx_entries):
-            self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')

@ -533,7 +525,8 @@ class IndexHeader(object): # {{{
            raise ValueError('Invalid Primary Index Record')

        self.header_length, = struct.unpack('>I', raw[4:8])
-        self.unknown1 = raw[8:16]
+        self.unknown1 = raw[8:12]
+        self.header_type, = struct.unpack('>I', raw[12:16])
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
                'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
@ -562,13 +555,7 @@ class IndexHeader(object): # {{{
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
-        tag_table = tagx[12:self.tagx_header_length]
-        if len(tag_table) % 4 != 0:
-            raise ValueError('Invalid Tag table')
-        num_tagx_entries = len(tag_table) // 4
-        self.tagx_entries = []
-        for i in range(num_tagx_entries):
-            self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
+        self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')

@ -602,6 +589,7 @@ class IndexHeader(object): # {{{

        a('Header length: %d'%self.header_length)
        u(self.unknown1)
+        a('Header type: %d'%self.header_type)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
@ -661,19 +649,15 @@ class Tag(object): # {{{

    }

-    def __init__(self, tagx, vals, entry_type, cncx):
+    def __init__(self, tag_type, vals, cncx):
        self.value = vals if len(vals) > 1 else vals[0] if vals else None
-        self.entry_type = entry_type
-        tag_type = tagx.tag

        self.cncx_value = None
        if tag_type in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tag_type]
        else:
-            print ('Unknown tag value: %d in entry type: %s'%(tag_type,
-                entry_type))
-            self.desc = '??Unknown (tag value: %d type: %s)'%(
-                    tag_type, entry_type)
+            print ('Unknown tag value: %s'%tag_type)
+            self.desc = '??Unknown (tag value: %d)'%tag_type
            self.attr = 'unknown'

        if '_offset' in self.attr:
@ -695,50 +679,13 @@ class IndexEntry(object): # {{{
    used in the navigation UI.
    '''

-    def __init__(self, ident, entry_type, raw, cncx, tagx_entries,
-            control_byte_count):
+    def __init__(self, ident, entry, cncx):
+        try:
+            self.index = int(ident, 16)
+        except ValueError:
            self.index = ident
-        self.raw = raw
-        self.tags = []
-        self.entry_type = entry_type
-        self.byte_size = len(raw)
-
-        orig_raw = raw
-
-        if control_byte_count not in (1, 2):
-            raise ValueError('Unknown control byte count: %d'%
-                    control_byte_count)
-
-        self.flags = 0
-
-        if control_byte_count == 2:
-            self.flags = ord(raw[0])
-            raw = raw[1:]
-
-        expected_tags = [tag for tag in tagx_entries if tag.bitmask &
-                entry_type]
-
-        flags = self.flags
-        for tag in expected_tags:
-            vals = []
-
-            if tag.tag > 0b1000000: # 0b1000000 = 64
-                has_tag = flags & 0b1
-                flags = flags >> 1
-                if not has_tag: continue
-            for i in range(tag.num_values):
-                if not raw:
-                    raise ValueError('Index entry does not match TAGX header')
-                val, consumed = decint(raw)
-                raw = raw[consumed:]
-                vals.append(val)
-            self.tags.append(Tag(tag, vals, self.entry_type, cncx))
-
-        self.consumed = len(orig_raw) - len(raw)
-        self.trailing_bytes = raw
-        if self.trailing_bytes.replace(b'\0', b''):
-            raise ValueError('%s has leftover bytes: %s'%(self, format_bytes(
-                self.trailing_bytes)))
+        self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in
+                entry.iteritems()]

    @property
    def label(self):
@ -797,102 +744,14 @@ class IndexEntry(object): # {{{
        return [0, 0]

    def __str__(self):
-        ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, '
-                'length=%d, byte_size=%d)'%(
-            self.index, bin(self.entry_type), bin(self.flags)[2:],
-            len(self.tags), self.byte_size)]
+        ans = ['Index Entry(index=%s, length=%d)'%(
+            self.index, len(self.tags))]
        for tag in self.tags:
            if tag.value is not None:
                ans.append('\t'+str(tag))
        if self.first_child_index != -1:
            ans.append('\tNumber of children: %d'%(self.last_child_index -
                self.first_child_index + 1))
-        if self.trailing_bytes:
-            ans.append('\tTrailing bytes: %r'%self.trailing_bytes)
-        return '\n'.join(ans)
-
-# }}}
-
-class SecondaryIndexRecord(object): # {{{
-
-    def __init__(self, record, index_header, cncx):
-        self.record = record
-        raw = self.record.raw
-
-        if raw[:4] != b'INDX':
-            raise ValueError('Invalid Primary Index Record')
-
-        u = struct.unpack
-
-        self.header_length, = u('>I', raw[4:8])
-        self.unknown1 = raw[8:12]
-        self.header_type, = u('>I', raw[12:16])
-        self.unknown2 = raw[16:20]
-        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
-        if self.idxt_offset < 192:
-            raise ValueError('Unknown Index record structure')
-        self.unknown3 = raw[28:36]
-        self.unknown4 = raw[36:192] # Should be 156 bytes
-
-        self.index_offsets = []
-        indices = raw[self.idxt_offset:]
-        if indices[:4] != b'IDXT':
-            raise ValueError("Invalid IDXT index table")
-        indices = indices[4:]
-        for i in range(self.idxt_count):
-            off, = u(b'>H', indices[i*2:(i+1)*2])
-            self.index_offsets.append(off-192)
-        rest = indices[(i+1)*2:]
-        if rest.replace(b'\0', ''): # There can be padding null bytes
-            raise ValueError('Extra bytes after IDXT table: %r'%rest)
-
-        indxt = raw[192:self.idxt_offset]
-        self.size_of_indxt_block = len(indxt)
-
-        self.indices = []
-        for i, off in enumerate(self.index_offsets):
-            try:
-                next_off = self.index_offsets[i+1]
-            except:
-                next_off = len(indxt)
-            num = ord(indxt[off])
-            index = indxt[off+1:off+1+num]
-            consumed = 1 + num
-            entry_type = ord(indxt[off+consumed])
-            pos = off+consumed+1
-            idxe = IndexEntry(index, entry_type,
-                    indxt[pos:next_off], cncx,
-                    index_header.tagx_entries,
-                    index_header.tagx_control_byte_count)
-            self.indices.append(idxe)
-
-        rest = indxt[pos+self.indices[-1].consumed:]
-        if rest.replace(b'\0', b''): # There can be padding null bytes
-            raise ValueError('Extra bytes after IDXT table: %r'%rest)
-
-
-    def __str__(self):
-        ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
-        a = ans.append
-        def u(w):
-            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
-                len(w), not bool(w.replace(b'\0', b'')) ))
-        a('Header length: %d'%self.header_length)
-        u(self.unknown1)
-        a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
-        u(self.unknown2)
-        a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
-            self.idxt_offset))
-        a('IDXT Count: %d'%self.idxt_count)
-        u(self.unknown3)
-        u(self.unknown4)
-        a('Index offsets: %r'%self.index_offsets)
-        a('\nIndex Entries (%d entries):'%len(self.indices))
-        for entry in self.indices:
-            a(str(entry))
-            a('')
-
-
        return '\n'.join(ans)

 # }}}
@ -904,58 +763,25 @@ class IndexRecord(object): # {{{
    in the trailing data of the text records.
    '''

-    def __init__(self, record, index_header, cncx):
-        self.record = record
+    def __init__(self, records, index_header, cncx):
        self.alltext = None
-        raw = self.record.raw
+        table = OrderedDict()
+        tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
+                index_header.tagx_entries]
+        for record in records:
+            raw = record.raw

            if raw[:4] != b'INDX':
                raise ValueError('Invalid Primary Index Record')

-        u = struct.unpack
+            parse_index_record(table, record.raw,
+                    index_header.tagx_control_byte_count, tags,
+                    index_header.index_encoding, strict=True)

-        self.header_length, = u('>I', raw[4:8])
-        self.unknown1 = raw[8:12]
-        self.header_type, = u('>I', raw[12:16])
-        self.unknown2 = raw[16:20]
-        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
-        if self.idxt_offset < 192:
-            raise ValueError('Unknown Index record structure')
-        self.unknown3 = raw[28:36]
-        self.unknown4 = raw[36:192] # Should be 156 bytes
-
-        self.index_offsets = []
-        indices = raw[self.idxt_offset:]
-        if indices[:4] != b'IDXT':
-            raise ValueError("Invalid IDXT index table")
-        indices = indices[4:]
-        for i in range(self.idxt_count):
-            off, = u(b'>H', indices[i*2:(i+1)*2])
-            self.index_offsets.append(off-192)
-        rest = indices[(i+1)*2:]
-        if rest.replace(b'\0', ''): # There can be padding null bytes
-            raise ValueError('Extra bytes after IDXT table: %r'%rest)
-
-        indxt = raw[192:self.idxt_offset]
-        self.size_of_indxt_block = len(indxt)
        self.indices = []
-        for i, off in enumerate(self.index_offsets):
-            try:
-                next_off = self.index_offsets[i+1]
-            except:
-                next_off = len(indxt)
-            index, consumed = decode_hex_number(indxt[off:])
-            entry_type = ord(indxt[off+consumed])
-            pos = off+consumed+1
-            idxe = IndexEntry(index, entry_type,
-                    indxt[pos:next_off], cncx,
-                    index_header.tagx_entries,
-                    index_header.tagx_control_byte_count)
-            self.indices.append(idxe)

-        rest = indxt[pos+self.indices[-1].consumed:]
-        if rest.replace(b'\0', b''): # There can be padding null bytes
-            raise ValueError('Extra bytes after IDXT table: %r'%rest)
+        for ident, entry in table.iteritems():
+            self.indices.append(IndexEntry(ident, entry, cncx))

    def get_parent(self, index):
        if index.depth < 1:
@ -965,24 +791,12 @@ class IndexRecord(object): # {{{
            if p.depth != parent_depth:
                continue

-
    def __str__(self):
-        ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
+        ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))
-        a('Header length: %d'%self.header_length)
-        u(self.unknown1)
-        a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
-        u(self.unknown2)
-        a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
-            self.idxt_offset))
-        a('IDXT Count: %d'%self.idxt_count)
-        u(self.unknown3)
-        u(self.unknown4)
-        a('Index offsets: %r'%self.index_offsets)
-        a('\nIndex Entries (%d entries):'%len(self.indices))
        for entry in self.indices:
            offset = entry.offset
            a(str(entry))
@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{

    def get_index(self, idx):
        for i in self.indices:
-            if i.index == idx: return i
+            if i.index in {idx, unicode(idx)}: return i
        raise IndexError('Index %d not found'%idx)

    def __str__(self):
@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{
            if entries:
                ans.append('\t%s:'%typ)
                for x in entries:
-                    ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
+                    ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
                            'Depth: %d, Offset: %d, Size: %d) [%s]')%(
                        x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
        def bin4(num):
@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{
                        ' when reading starting section'%extra)
            si = self.get_index(si)
            ans.append('The section at the start of this record is:'
-                    ' %d'%si.index)
+                    ' %s'%si.index)
            if 0b0100 in extra:
                num = extra[0b0100]
                ans.append('The number of articles from the section %d'
-                        ' in this record: %d'%(si.index, num))
+                        ' in this record: %s'%(si.index, num))
            elif 0b0001 in extra:
                eof = extra[0b0001]
                if eof != 0:
                    raise ValueError('Unknown eof value %s when reading'
                            ' starting section. All bytes: %r'%(eof, orig))
                ans.append('??This record has more than one article from '
-                        ' the section: %d'%si.index)
+                        ' the section: %s'%si.index)
            return si, byts
        # }}}

@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{
        pir = self.mobi_header.primary_index_record
        if pir != NULL_INDEX:
            self.index_header = IndexHeader(self.records[pir])
+            numi = self.index_header.index_count
            self.cncx = CNCX(self.records[
-                pir+2:pir+2+self.index_header.num_of_cncx_blocks],
+                pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
                self.index_header.index_encoding)
-            self.index_record = IndexRecord(self.records[pir+1],
+            self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
                    self.index_header, self.cncx)
            self.indexing_record_nums = set(xrange(pir,
-                pir+2+self.index_header.num_of_cncx_blocks))
+                pir+1+numi+self.index_header.num_of_cncx_blocks))
        self.secondary_index_record = self.secondary_index_header = None
        sir = self.mobi_header.secondary_index_record
        if sir != NULL_INDEX:
            self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
+            numi = self.secondary_index_header.index_count
            self.indexing_record_nums.add(sir)
-            self.secondary_index_record = SecondaryIndexRecord(
-                    self.records[sir+1], self.secondary_index_header, self.cncx)
-            self.indexing_record_nums.add(sir+1)
+            self.secondary_index_record = IndexRecord(
+                    self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
+            self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))


        ntr = self.mobi_header.number_of_text_records
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import struct
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple

-from calibre.ebooks.mobi.utils import decint, count_set_bits
+from calibre.ebooks.mobi.utils import (decint, count_set_bits,
+        decode_string)
+
+TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
+PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')

 class InvalidFile(ValueError):
    pass
@ -37,9 +41,8 @@ def parse_indx_header(data):
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    )
    num = len(words)
-    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
-    header = {words[i]:values[i] for i in xrange(num)}
-    return header
+    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
+    return dict(zip(words, values))

 class CNCX(object): # {{{

@ -77,101 +80,94 @@ class CNCX(object): # {{{
        return self.records.get(offset, default)
 # }}}

-def parse_tag_section(data):
+def parse_tagx_section(data):
    check_signature(data, b'TAGX')

    tags = []
-    first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
-    control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
+    first_entry_offset, = struct.unpack_from(b'>L', data, 4)
+    control_byte_count, = struct.unpack_from(b'>L', data, 8)

-    # Skip the first 12 bytes already read above.
    for i in xrange(12, first_entry_offset, 4):
-        pos = i
-        tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
-            ord(data[pos+3])))
+        vals = list(bytearray(data[i:i+4]))
+        tags.append(TagX(*vals))
    return control_byte_count, tags

-def get_tag_map(control_byte_count, tags, data, start, end):
+def get_tag_map(control_byte_count, tagx, data, strict=False):
    ptags = []
    ans = {}
-    control_byte_index = 0
-    data_start = start + control_byte_count
+    control_bytes = list(bytearray(data[:control_byte_count]))
+    data = data[control_byte_count:]

-    for tag, values_per_entry, mask, end_flag in tags:
-        if end_flag == 0x01:
-            control_byte_index += 1
+    for x in tagx:
+        if x.eof == 0x01:
+            control_bytes = control_bytes[1:]
            continue
-        value = ord(data[start + control_byte_index]) & mask
+        value = control_bytes[0] & x.bitmask
        if value != 0:
-            if value == mask:
-                if count_set_bits(mask) > 1:
+            value_count = value_bytes = None
+            if value == x.bitmask:
+                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
-                    value, consumed = decint(data[data_start:])
-                    data_start += consumed
-                    ptags.append((tag, None, value, values_per_entry))
+                    value_bytes, consumed = decint(data)
+                    data = data[consumed:]
                else:
-                    ptags.append((tag, 1, None, values_per_entry))
+                    value_count = 1
            else:
                # Shift bits to get the masked value.
-                while mask & 0x01 == 0:
-                    mask = mask >> 1
-                    value = value >> 1
-                ptags.append((tag, value, None, values_per_entry))
-    for tag, value_count, value_bytes, values_per_entry in ptags:
+                mask = x.bitmask
+                while mask & 0b1 == 0:
+                    mask >>= 1
+                    value >>= 1
+                value_count = value
+            ptags.append(PTagX(x.tag, value_count, value_bytes,
+                x.num_of_values))
+
+    for x in ptags:
        values = []
-        if value_count != None:
+        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
-            for _ in xrange(value_count*values_per_entry):
-                byts, consumed = decint(data[data_start:])
-                data_start += consumed
+            for _ in xrange(x.value_count * x.num_of_values):
+                byts, consumed = decint(data)
+                data = data[consumed:]
                values.append(byts)
-        else:
+        else: # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
-            while total_consumed < value_bytes:
+            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
-                byts, consumed = decint(data[data_start:])
-                data_start += consumed
+                byts, consumed = decint(data)
+                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
-            if total_consumed != value_bytes:
-                print ("Error: Should consume %s bytes, but consumed %s" %
-                        (value_bytes, total_consumed))
-        ans[tag] = values
-    # Test that all bytes have been processed if end is given.
-    if end is not None and data_start < end:
-        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
-        rest = data[data_start:end]
-        if rest.replace(b'\0', b''):
-            print ("Warning: There are unprocessed index bytes left: %s" %
-                    format_bytes(rest))
+            if total_consumed != x.value_bytes:
+                err = ("Error: Should consume %s bytes, but consumed %s" %
+                        (x.value_bytes, total_consumed))
+                if strict:
+                    raise ValueError(err)
+                else:
+                    print(err)
+        ans[x.tag] = values
+    # Test that all bytes have been processed
+    if data.replace(b'\0', b''):
+        err = ("Warning: There are unprocessed index bytes left: %s" %
+                format_bytes(data))
+        if strict:
+            raise ValueError(err)
+        else:
+            print(err)

    return ans

-def read_index(sections, idx, codec):
-    table, cncx = OrderedDict(), CNCX([], codec)
-
-    data = sections[idx][0]
-
-    indx_header = parse_indx_header(data)
-    indx_count = indx_header['count']
-
-    if indx_header['ncncx'] > 0:
-        off = idx + indx_count + 1
-        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
-        cncx = CNCX(cncx_records, codec)
-
-    tag_section_start = indx_header['len']
-    control_byte_count, tags = parse_tag_section(data[tag_section_start:])
-
-    for i in xrange(idx + 1, idx + 1 + indx_count):
-        data = sections[i][0]
+def parse_index_record(table, data, control_byte_count, tags, codec,
+        strict=False):
    header = parse_indx_header(data)
    idxt_pos = header['start']
+    if data[idxt_pos:idxt_pos+4] != b'IDXT':
+        print ('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
@ -187,11 +183,32 @@ def read_index(sections, idx, codec):
    # text
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
-            text_length = ord(data[start])
-            text = data[start+1:start+1+text_length]
-            tag_map = get_tag_map(control_byte_count, tags, data,
-                    start+1+text_length, end)
-            table[text] = tag_map
+        rec = data[start:end]
+        ident, consumed = decode_string(rec, codec=codec)
+        rec = rec[consumed:]
+        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
+        table[ident] = tag_map

+
+def read_index(sections, idx, codec):
+    table, cncx = OrderedDict(), CNCX([], codec)
+
+    data = sections[idx][0]
+
+    indx_header = parse_indx_header(data)
+    indx_count = indx_header['count']
+
+    if indx_header['ncncx'] > 0:
+        off = idx + indx_count + 1
+        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
+        cncx = CNCX(cncx_records, codec)
+
+    tag_section_start = indx_header['len']
+    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
+
+    for i in xrange(idx + 1, idx + 1 + indx_count):
+        # Index record
+        data = sections[i][0]
+        parse_index_record(table, data, control_byte_count, tags, codec)
    return table, cncx

--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -317,6 +317,7 @@ class Mobi8Reader(object):
        for entry in index_entries:
            pos = entry['pos']
            fi = self.get_file_info(pos)
+            #print (11111111, fi, entry['pos_fid'])
            if fi.filename is None:
                raise ValueError('Index entry has invalid pos: %d'%pos)
            idtag = self.get_id_tag(pos).decode(self.header.codec)
--- a/src/calibre/ebooks/mobi/reader/ncx.py
+++ b/src/calibre/ebooks/mobi/reader/ncx.py
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import os

 from calibre.ebooks.metadata.toc import TOC
-from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index

@ -23,7 +22,30 @@ tag_fieldname_map = {
        6:  ['pos_fid',0],
        21: ['parent',0],
        22: ['child1',0],
-        23: ['childn',0]
+        23: ['childn',0],
+        69: ['image_index',0],
+        70 : ['desc_offset', 0], # 'Description offset in cncx'
+        71 : ['author_offset', 0], # 'Author offset in cncx'
+        72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
+        73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
+
+}
+
+default_entry = {
+                    'pos':  -1,
+                    'len':  0,
+                    'noffs': -1,
+                    'text' : "Unknown Text",
+                    'hlvl' : -1,
+                    'kind' : "Unknown Class",
+                    'pos_fid' : None,
+                    'parent' : -1,
+                    'child1' : -1,
+                    'childn' : -1,
+                    'description': None,
+                    'author': None,
+                    'image_caption': None,
+                    'image_attribution': None,
 }

 def read_ncx(sections, index, codec):
@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):

        for num, x in enumerate(table.iteritems()):
            text, tag_map = x
-            entry = {
-                    'name': text,
-                    'pos':  -1,
-                    'len':  0,
-                    'noffs': -1,
-                    'text' : "Unknown Text",
-                    'hlvl' : -1,
-                    'kind' : "Unknown Kind",
-                    'pos_fid' : None,
-                    'parent' : -1,
-                    'child1' : -1,
-                    'childn' : -1,
-                    'num'  : num
-            }
+            entry = default_entry.copy()
+            entry['name'] = text
+            entry['num'] = num

-            for tag in tag_fieldname_map.keys():
+            for tag in tag_fieldname_map.iterkeys():
                fieldname, i = tag_fieldname_map[tag]
                if tag in tag_map:
                    fieldvalue = tag_map[tag][i]
                    if tag == 6:
-                        fieldvalue = to_base(fieldvalue, base=32)
+                        # Appears to be an idx into the KF8 elems table with an
+                        # offset
+                        fieldvalue = tuple(tag_map[tag])
                    entry[fieldname] = fieldvalue
-                    if tag == 3:
-                        entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
-                    if tag == 5:
-                        entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
+                    cncx_field = {3:'text', 5:'kind', 70:'description',
+                            71:'author', 72:'image_caption',
+                            73:'image_attribution'}.get(tag)
+                    if cncx_field is not None: # value is looked up in cncx
+                        entry[cncx_field] = cncx.get(fieldvalue,
+                                default_entry[cncx_field])
            index_entries.append(entry)

    return index_entries
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -15,7 +15,13 @@ from calibre.ebooks import normalize

 IMAGE_MAX_SIZE = 10 * 1024 * 1024

-def decode_hex_number(raw):
+def decode_string(raw, codec='utf-8'):
+    length, = struct.unpack(b'>B', raw[0])
+    raw = raw[1:1+length]
+    consumed = length+1
+    return raw.decode(codec), consumed
+
+def decode_hex_number(raw, codec='utf-8'):
    '''
    Return a variable length number encoded using hexadecimal encoding. These
    numbers have the first byte which tells the number of bytes that follow.
@ -25,13 +31,16 @@ def decode_hex_number(raw):
    :param raw: Raw binary data as a bytestring

    :return: The number and the number of bytes from raw that the number
-    occupies
+    occupies.
    '''
-    length, = struct.unpack(b'>B', raw[0])
-    raw = raw[1:1+length]
-    consumed = length+1
+    raw, consumed = decode_string(raw, codec=codec)
    return int(raw, 16), consumed

+def encode_string(raw):
+    ans = bytearray(bytes(raw))
+    ans.insert(0, len(ans))
+    return bytes(ans)
+
 def encode_number_as_hex(num):
    '''
    Encode num as a variable length encoded hexadecimal number. Returns the
@ -44,9 +53,7 @@ def encode_number_as_hex(num):
    nlen = len(num)
    if nlen % 2 != 0:
        num = b'0'+num
-    ans = bytearray(num)
-    ans.insert(0, len(num))
-    return bytes(ans)
+    return encode_string(num)

 def encint(value, forward=True):
    '''