Support KF8 files in inspect mobi tool

This commit is contained in:
Kovid Goyal 2012-03-16 10:56:24 +05:30
parent 21f493e561
commit b8d0c49f99

View File

@ -405,7 +405,7 @@ class MOBIHeader(object): # {{{
class TagX(object): # {{{
def __init__(self, raw, control_byte_count):
def __init__(self, raw):
self.tag = ord(raw[0])
self.num_values = ord(raw[1])
self.bitmask = ord(raw[2])
@ -465,8 +465,7 @@ class SecondaryIndexHeader(object): # {{{
num_tagx_entries = len(tag_table) // 4
self.tagx_entries = []
for i in range(num_tagx_entries):
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4],
self.tagx_control_byte_count))
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
raise ValueError('TAGX last entry is not EOF')
@ -569,8 +568,7 @@ class IndexHeader(object): # {{{
num_tagx_entries = len(tag_table) // 4
self.tagx_entries = []
for i in range(num_tagx_entries):
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4],
self.tagx_control_byte_count))
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
raise ValueError('TAGX last entry is not EOF')
@ -640,74 +638,29 @@ class Tag(object): # {{{
TAG_MAP = {
1: ('offset', 'Offset in HTML'),
2: ('size', 'Size in HTML'),
3: ('label_offset', 'Offset to label in CNCX'),
3: ('label_offset', 'Label offset in CNCX'),
4: ('depth', 'Depth of this entry in TOC'),
5: ('class_offset', 'Class offset in CNCX'),
6: ('pos_fid', 'File Index'),
11: ('secondary', '[unknown, unknown, '
'tag type from TAGX in primary index header]'),
# The remaining tag types have to be interpreted subject to the type
# of index entry they are present in
21: ('parent_index', 'Parent'),
22: ('first_child_index', 'First child'),
23: ('last_child_index', 'Last child'),
69 : ('image_index', 'Offset from first image record to the'
' image record associated with this entry'
' (masthead for periodical or thumbnail for'
' article entry).'),
70 : ('desc_offset', 'Description offset in cncx'),
71 : ('author_offset', 'Author offset in cncx'),
72 : ('image_caption_offset', 'Image caption offset in cncx'),
73 : ('image_attr_offset', 'Image attribution offset in cncx'),
}
INTERPRET_MAP = {
'subchapter': {
21 : ('Parent chapter index', 'parent_index'),
5 : ('Unknown (koffs)', 'koffs'),
},
'section' : {
6 : ('File Index', 'pos_fid'),
},
'section_with_subsections' : {
6 : ('File Index', 'pos_fid'),
22 : ('First subsection index', 'first_child_index'),
23 : ('Last subsection index', 'last_child_index'),
},
'subsection' : {
6 : ('File Index', 'pos_fid'),
21 : ('Parent section index', 'parent_index'),
},
'article' : {
5 : ('Class offset in cncx', 'class_offset'),
21 : ('Parent section index', 'parent_index'),
69 : ('Offset from first image record num to the'
' image record associated with this article',
'image_index'),
70 : ('Description offset in cncx', 'desc_offset'),
71 : ('Author offset in cncx', 'author_offset'),
72 : ('Image caption offset in cncx',
'image_caption_offset'),
73 : ('Image attribution offset in cncx',
'image_attr_offset'),
},
'chapter_with_subchapters' : {
22 : ('First subchapter index', 'first_child_index'),
23 : ('Last subchapter index', 'last_child_index'),
},
'periodical' : {
5 : ('Class offset in cncx', 'class_offset'),
22 : ('First section index', 'first_child_index'),
23 : ('Last section index', 'last_child_index'),
69 : ('Offset from first image record num to masthead'
' record', 'image_index'),
},
'section' : {
5 : ('Class offset in cncx', 'class_offset'),
21 : ('Periodical index', 'parent_index'),
22 : ('First article index', 'first_child_index'),
23 : ('Last article index', 'last_child_index'),
},
}
def __init__(self, tagx, vals, entry_type, cncx):
self.value = vals if len(vals) > 1 else vals[0] if vals else None
self.entry_type = entry_type
@ -717,24 +670,15 @@ class Tag(object): # {{{
if tag_type in self.TAG_MAP:
self.attr, self.desc = self.TAG_MAP[tag_type]
else:
try:
td = self.INTERPRET_MAP[entry_type]
except:
raise ValueError('Unknown entry type: %s'%entry_type)
try:
self.desc, self.attr = td[tag_type]
except:
print ('Unknown tag value: %d in entry type: %s'%(tag_type,
entry_type))
self.desc = '??Unknown (tag value: %d type: %s)'%(
tag_type, entry_type)
self.attr = 'unknown'
print ('Unknown tag value: %d in entry type: %s'%(tag_type,
entry_type))
self.desc = '??Unknown (tag value: %d type: %s)'%(
tag_type, entry_type)
self.attr = 'unknown'
if '_offset' in self.attr:
self.cncx_value = cncx[self.value]
if self.attr == 'pos_fid':
print (1111111, vals)
def __str__(self):
if self.cncx_value is not None:
return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value)
@ -746,43 +690,21 @@ class IndexEntry(object): # {{{
'''
The index is made up of entries, each of which is represented by an
instance of this class. Index entries typically point to offsets int eh
instance of this class. Index entries typically point to offsets in the
HTML, specify HTML sizes and point to text strings in the CNCX that are
used in the navigation UI.
'''
TYPES = {
# Present in secondary index record
0x01 : 'null',
0x02 : 'publication_meta',
# Present in book type files
0x0f : 'chapter',
0x6f : 'chapter_with_subchapters',
0x1f : 'subchapter',
0x8f : 'section',
0xef : 'section_with_subsections',
0x9f : 'subsection',
# Present in periodicals
0xdf : 'periodical',
0xff : 'section',
0x3f : 'article',
}
def __init__(self, ident, entry_type, raw, cncx, tagx_entries,
control_byte_count):
self.index = ident
self.raw = raw
self.tags = []
self.entry_type_raw = entry_type
self.entry_type = entry_type
self.byte_size = len(raw)
orig_raw = raw
try:
self.entry_type = self.TYPES[entry_type]
except KeyError:
raise ValueError('Unknown Index Entry type: %s'%bin(entry_type))
if control_byte_count not in (1, 2):
raise ValueError('Unknown control byte count: %d'%
control_byte_count)
@ -800,7 +722,7 @@ class IndexEntry(object): # {{{
for tag in expected_tags:
vals = []
if tag.tag > 64:
if tag.tag > 0b1000000: # 0b1000000 = 64
has_tag = flags & 0b1
flags = flags >> 1
if not has_tag: continue
@ -867,10 +789,17 @@ class IndexEntry(object): # {{{
return tag.value
return -1
@property
def pos_fid(self):
for tag in self.tags:
if tag.attr == 'pos_fid':
return tag.value
return [0, 0]
def __str__(self):
ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, '
'length=%d, byte_size=%d)'%(
self.index, self.entry_type, bin(self.flags)[2:],
self.index, bin(self.entry_type), bin(self.flags)[2:],
len(self.tags), self.byte_size)]
for tag in self.tags:
if tag.value is not None: