Mobi debug: Dump KF8 SKEL and SECT indices

This commit is contained in:
Kovid Goyal 2012-04-11 15:44:40 +05:30
parent 5fd415ea2d
commit 94ff0c64d5
3 changed files with 146 additions and 0 deletions

View File

@ -0,0 +1,126 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
# One parsed entry of the SKEL index (built by SKELIndex below).
File = namedtuple('File', [
    'file_number',
    'name',
    'divtbl_count',
    'start_position',
    'length',
])

# One parsed entry of the SECT index (built by SECTIndex below).
Elem = namedtuple('Elem', [
    'insert_pos',
    'toc_text',
    'file_number',
    'sequence_number',
    'start_pos',
    'length',
])
def read_index(sections, idx, codec):
    '''
    Parse the index whose header record is ``sections[idx]``.

    :param sections: Sequence of record objects; each must expose its bytes
                     as a ``raw`` attribute.
    :param idx: Position in ``sections`` of the index header record.
    :param codec: Codec handed on to ``CNCX`` and ``parse_index_record``.
    :return: ``(table, cncx, indx_header)`` where ``table`` is the
             ``OrderedDict`` filled in by ``parse_index_record``, ``cncx`` is
             a ``CNCX`` object (built from no records when the header
             declares none) and ``indx_header`` is the parsed header.
    '''
    table = OrderedDict()
    cncx = CNCX([], codec)

    header_raw = sections[idx].raw
    indx_header = parse_indx_header(header_raw)
    num_index_records = indx_header['count']

    num_cncx = indx_header['ncncx']
    if num_cncx > 0:
        # The CNCX records sit directly after the index records
        cncx_start = idx + num_index_records + 1
        cncx = CNCX(
            [rec.raw for rec in sections[cncx_start:cncx_start + num_cncx]],
            codec)

    # The TAGX section lives inside the header record itself
    tagx_offset = indx_header['tagx']
    control_byte_count, tags = parse_tagx_section(header_raw[tagx_offset:])

    first_record = idx + 1
    for rec_num in xrange(first_record, first_record + num_index_records):
        # Index record
        record_raw = sections[rec_num].raw
        parse_index_record(table, record_raw, control_byte_count, tags,
                codec, indx_header['ordt_map'], strict=True)

    return table, cncx, indx_header
class Index(object):
    '''
    Base class for dumping a parsed index as human readable text.

    After construction ``table``, ``cncx`` and ``header`` hold the values
    returned by ``read_index``, or ``None`` when ``idx`` is ``NULL_INDEX``.
    ``records`` is always ``None`` here; subclasses fill it in.
    '''

    def __init__(self, idx, records, codec):
        self.table = None
        self.cncx = None
        self.header = None
        self.records = None

        if idx != NULL_INDEX:
            parsed = read_index(records, idx, codec)
            self.table, self.cncx, self.header = parsed

    def render(self):
        ''' Return the dump as a list of lines (without line terminators) '''
        stars = '*'*10
        lines = [stars + ' Index Header ' + stars]

        if self.header is not None:
            lines.extend('%-12s: %r'%(field, self.header[field])
                    for field in INDEX_HEADER_FIELDS)
        lines += ['', '']

        if self.cncx:
            lines.append(stars + ' CNCX ' + stars)
            lines.extend('%10s: %s'%(offset, val)
                    for offset, val in self.cncx.iteritems())
            lines += ['', '']

        if self.table is not None:
            lines.append(stars + ' %d Index Entries '%len(self.table) + stars)
            lines.extend('%s: %r'%(k, v) for k, v in self.table.iteritems())

        if self.records:
            lines += ['', '', stars + ' Parsed Entries ' + stars]
            lines.extend(repr(entry) for entry in self.records)

        lines.append('')
        return lines

    def __str__(self):
        return '\n'.join(self.render())
class SKELIndex(Index):
    '''
    Dump of the SKEL index. Every entry of the parsed table becomes one
    ``File`` record in ``self.records``.
    '''

    def __init__(self, skelidx, records, codec):
        '''
        :param skelidx: Record number of the SKEL index header, or
                        ``NULL_INDEX`` when there is no such index.
        :param records: Sequence of record objects, passed on to ``Index``.
        :param codec: Codec passed on to ``Index``.
        :raises ValueError: If an entry carries a tag other than 1 and 6, or
                            lacks one of those two tags.
        '''
        super(SKELIndex, self).__init__(skelidx, records, codec)
        self.records = []

        if self.table is not None:
            expected_tags = {1, 6}
            for i, (text, tag_map) in enumerate(self.table.iteritems()):
                found_tags = set(tag_map.iterkeys())
                unknown = found_tags - expected_tags
                if unknown:
                    raise ValueError('SKEL Index has unknown tags: %s'%
                            (unknown,))
                # Report missing tags explicitly. Previously they were
                # reported as "unknown tags" with an empty set.
                missing = expected_tags - found_tags
                if missing:
                    raise ValueError(
                        'SKEL Index is missing required tags: %s'%(missing,))
                self.records.append(File(
                    i,              # file_number
                    text,           # name
                    tag_map[1][0],  # divtbl_count
                    tag_map[6][0],  # start_position
                    tag_map[6][1])  # length
                )
class SECTIndex(Index):
    '''
    Dump of the SECT index. Every entry of the parsed table becomes one
    ``Elem`` record in ``self.records``.
    '''

    def __init__(self, sectidx, records, codec):
        '''
        :param sectidx: Record number of the SECT index header, or
                        ``NULL_INDEX`` when there is no such index.
        :param records: Sequence of record objects, passed on to ``Index``.
        :param codec: Codec passed on to ``Index``.
        :raises ValueError: If an entry carries a tag other than 2, 3, 4 and
                            6, or lacks one of those tags.
        '''
        super(SECTIndex, self).__init__(sectidx, records, codec)
        self.records = []

        if self.table is not None:
            expected_tags = {2, 3, 4, 6}
            for text, tag_map in self.table.iteritems():
                found_tags = set(tag_map.iterkeys())
                unknown = found_tags - expected_tags
                if unknown:
                    raise ValueError('SECT Index has unknown tags: %s'%
                            (unknown,))
                # Report missing tags explicitly. Previously they were
                # reported as "unknown tags" with an empty set.
                missing = expected_tags - found_tags
                if missing:
                    raise ValueError(
                        'SECT Index is missing required tags: %s'%(missing,))

                # Tag 2 holds the CNCX offset of the text for this entry
                toc_text = self.cncx[tag_map[2][0]]
                self.records.append(Elem(
                    int(text),      # insert_pos
                    toc_text,       # toc_text
                    tag_map[3][0],  # file_number
                    tag_map[4][0],  # sequence_number
                    tag_map[6][0],  # start_pos
                    tag_map[6][1]   # length
                    )
                )

View File

@ -11,6 +11,7 @@ import sys, os, imghdr, struct
from itertools import izip
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex)
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@ -65,6 +66,7 @@ class MOBIFile(object):
self.header = self.mf.mobi8_header
self.extract_resources()
self.read_fdst()
self.read_indices()
def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -85,6 +87,12 @@ class MOBIFile(object):
if self.fdst.num_sections != self.header.fdst_count:
raise ValueError('KF8 Header contains invalid FDST count')
def read_indices(self):
    ''' Parse the SKEL and SECT indices named in the KF8 header into
    ``self.skel_index`` and ``self.sect_index`` '''
    header = self.header
    skel_idx = header.skel_idx
    records = self.mf.records
    codec = header.encoding
    self.skel_index = SKELIndex(skel_idx, records, codec)
    self.sect_index = SECTIndex(header.sect_idx, records, codec)
def extract_resources(self):
self.resource_map = []
known_types = {b'FLIS', b'FCIS', b'SRCS',
@ -145,3 +153,9 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8'))
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
fo.write(str(f.skel_index).encode('utf-8'))
with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
fo.write(str(f.sect_index).encode('utf-8'))

View File

@ -111,6 +111,12 @@ class CNCX(object): # {{{
def get(self, offset, default=None):
    ''' Return the record stored at ``offset``, or ``default`` when there
    is none '''
    records = self.records
    return records.get(offset, default)
def __bool__(self):
    ''' True iff at least one record is present '''
    return bool(self.records)

# Python 2 truth testing looks up __nonzero__, never __bool__. Without this
# alias ``if cncx:`` does not call the method above.
__nonzero__ = __bool__
def iteritems(self):
    ''' Iterate over the ``(offset, value)`` pairs of the stored records '''
    records = self.records
    return records.iteritems()
# }}}
def parse_tagx_section(data):