mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Mobi debug: Figured out the TAGX table, use it to properly decode the index entries
This commit is contained in:
parent
08dff7d722
commit
55987fa6cb
@ -377,18 +377,17 @@ class TagX(object): # {{{
|
||||
def __init__(self, raw, control_byte_count):
|
||||
self.tag = ord(raw[0])
|
||||
self.num_values = ord(raw[1])
|
||||
self.bmask = ord(raw[2])
|
||||
self.bitmask = bin(self.bmask)
|
||||
self.bitmask = ord(raw[2])
|
||||
# End of file = 1 iff last entry
|
||||
# When it is 1 all others are 0
|
||||
self.eof = ord(raw[3])
|
||||
|
||||
self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
|
||||
and self.bmask == 0)
|
||||
and self.bitmask == 0)
|
||||
|
||||
def __repr__(self):
|
||||
return 'TAGX(tag=%02d, num_values=%d, bitmask=%r (%d), eof=%d)' % (self.tag,
|
||||
self.num_values, self.bitmask, self.bmask, self.eof)
|
||||
return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag,
|
||||
self.num_values, bin(self.bitmask), self.eof)
|
||||
# }}}
|
||||
|
||||
class IndexHeader(object): # {{{
|
||||
@ -444,6 +443,7 @@ class IndexHeader(object): # {{{
|
||||
self.tagx_control_byte_count))
|
||||
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
|
||||
raise ValueError('TAGX last entry is not EOF')
|
||||
self.tagx_entries = self.tagx_entries[:-1]
|
||||
|
||||
idxt0_pos = self.header_length+self.tagx_header_length
|
||||
last_num, consumed = decode_hex_number(raw[idxt0_pos:])
|
||||
@ -497,6 +497,81 @@ class IndexHeader(object): # {{{
|
||||
return '\n'.join(ans)
|
||||
# }}}
|
||||
|
||||
class Tag(object): # {{{

    '''
    Index entries are a collection of tags. Each tag is represented by this
    class.

    Every instance exposes:

        value       -- the decoded value; a single value is unwrapped from
                       its list, several values are kept as the list
        attr        -- programmatic name of the tag (e.g. 'label_offset')
        desc        -- human readable description of the tag
        entry_type  -- name of the index entry type this tag belongs to
        ctoc_value  -- result of looking up ``value`` in ``ctoc`` when
                       ``attr`` contains '_offset', otherwise None
    '''

    # Tags whose meaning does not depend on the entry type.
    # Maps tag number -> (attr, desc)
    TAG_MAP = {
            1: ('offset', 'Offset in HTML'),
            2: ('size', 'Size in HTML'),
            3: ('label_offset', 'Offset to label in CNCX'),
            4: ('depth', 'Depth of this entry in TOC'),

            # The remaining tag types have to be interpreted subject to the type
            # of index entry they are present in
    }

    # Tags whose meaning depends on the entry type.
    # Maps entry type -> {tag number -> (desc, attr)}
    # NOTE: the tuple order here is (desc, attr), the reverse of TAG_MAP.
    INTERPRET_MAP = {
            'subchapter': {
                    5  : ('Parent chapter index', 'parent_index')
            },

            'article'   : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    21 : ('Parent section index', 'parent_index'),
                    22 : ('Description offset in CTOC', 'desc_offset'),
                    23 : ('Author offset in CTOC', 'author_offset'),
            },

            'chapter_with_subchapters' : {
                    22 : ('First subchapter index', 'first_subchapter_index'),
                    23 : ('Last subchapter index', 'last_subchapter_index'),
            },

            'periodical' : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    22 : ('First section index', 'first_section_index'),
                    23 : ('Last section index', 'last_section_index'),
            },

            'section' : {
                    5  : ('Class offset in CTOC', 'class_offset'),
                    21 : ('Periodical index', 'periodical_index'),
                    22 : ('First article index', 'first_article_index'),
                    23 : ('Last article index', 'last_article_index'),
            },
    }

    def __init__(self, tagx, vals, entry_type, ctoc):
        '''
        :param tagx: object whose ``tag`` attribute is the tag number
        :param vals: non-empty sequence of decoded values for this tag
        :param entry_type: name of the index entry type, used to interpret
                           tag numbers that are not in TAG_MAP
        :param ctoc: object indexed with the value (``ctoc[value]``) for
                     tags whose attr contains '_offset'

        Raises ValueError if the entry type or the tag number is unknown.
        '''
        self.value = vals if len(vals) > 1 else vals[0]
        self.entry_type = entry_type
        self.ctoc_value = None
        if tagx.tag in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tagx.tag]
        else:
            # Only a missing key means "unknown"; anything else (including
            # KeyboardInterrupt) must propagate unchanged.
            try:
                td = self.INTERPRET_MAP[entry_type]
            except KeyError:
                raise ValueError('Unknown entry type: %s'%entry_type)
            try:
                self.desc, self.attr = td[tagx.tag]
            except KeyError:
                raise ValueError('Unknown tag: %d for entry type: %s'%(
                    tagx.tag, entry_type))
        # 'offset' itself has no underscore prefix, so only the *_offset
        # attributes are resolved through ctoc
        if '_offset' in self.attr:
            self.ctoc_value = ctoc[self.value]

    def __str__(self):
        '''Return "desc : value", with the ctoc lookup result appended in
        square brackets when there is one.'''
        if self.ctoc_value is not None:
            return '%s : %r [%r]'%(self.desc, self.value, self.ctoc_value)
        return '%s : %r'%(self.desc, self.value)

# }}}
|
||||
|
||||
class IndexEntry(object): # {{{
|
||||
|
||||
TYPES = {
|
||||
@ -510,97 +585,41 @@ class IndexEntry(object): # {{{
|
||||
0x3f : 'article',
|
||||
}
|
||||
|
||||
def __init__(self, ident, entry_type, raw, is_last):
|
||||
def __init__(self, ident, entry_type, raw, ctoc, tagx_entries):
|
||||
self.index = ident
|
||||
self.fields = []
|
||||
self.sub_type = None
|
||||
self.raw = raw
|
||||
self.tags = []
|
||||
|
||||
try:
|
||||
self.entry_type = self.TYPES[entry_type]
|
||||
except KeyError:
|
||||
raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))
|
||||
|
||||
if self.entry_type in (0xdf, 0xff):
|
||||
self.subtype = ord(raw[0])
|
||||
raw = raw[1:]
|
||||
while raw:
|
||||
expected_tags = [tag for tag in tagx_entries if tag.bitmask &
|
||||
entry_type]
|
||||
|
||||
for tag in expected_tags:
|
||||
vals = []
|
||||
for i in range(tag.num_values):
|
||||
if not raw:
|
||||
raise ValueError('Index entry does not match TAGX header')
|
||||
val, consumed = decint(raw)
|
||||
raw = raw[consumed:]
|
||||
self.fields.append(val)
|
||||
if is_last and self.fields[-1] == 0:
|
||||
self.fields = self.fields[:-1]
|
||||
|
||||
self.interpret()
|
||||
|
||||
def interpret(self):
|
||||
self.offset = self.fields[0]
|
||||
self.object_size = self.fields[1]
|
||||
self.label_offset = self.fields[2]
|
||||
self.depth = self.fields[3]
|
||||
self.extra = OrderedDict()
|
||||
self.extra_fields = []
|
||||
if self.entry_type == 'subchapter':
|
||||
self.parent_index = self.fields[4]
|
||||
self.extra['Parent chapter index'] = 'parent_index'
|
||||
self.extra_fields = self.fields[5:]
|
||||
elif self.entry_type == 'article':
|
||||
self.class_offset = self.fields[4]
|
||||
self.extra['Class offset in CTOC'] = 'class_offset'
|
||||
self.parent_index = self.fields[5]
|
||||
self.extra['Parent section index'] = 'parent_index'
|
||||
if len(self.fields) > 6:
|
||||
self.desc_offset = self.fields[6]
|
||||
self.extra['Decription offset in CTOC'] = 'desc_offset'
|
||||
if len(self.fields) > 7:
|
||||
self.author_offset = self.fields[7]
|
||||
self.extra['Author offset in CTOC'] = 'author_offset'
|
||||
self.extra_fields = self.fields[8:]
|
||||
elif self.entry_type == 'chapter_with_subchapters':
|
||||
self.first_subchapter_index = self.fields[4]
|
||||
self.last_subchapter_index = self.fields[5]
|
||||
self.extra['First subchapter index'] = 'first_subchapter_index'
|
||||
self.extra['Last subchapter index'] = 'last_subchapter_index'
|
||||
self.extra_fields = self.fields[6:]
|
||||
elif self.entry_type == 'periodical':
|
||||
self.class_offset = self.fields[4]
|
||||
self.extra['Class offset in CTOC'] = 'class_offset'
|
||||
self.first_section_index = self.fields[5]
|
||||
self.last_section_index = self.fields[6]
|
||||
self.extra['First section index'] = 'first_section_index'
|
||||
self.extra['Last section index'] = 'last_section_index'
|
||||
self.extra_fields = self.fields[7:]
|
||||
elif self.entry_type == 'section':
|
||||
self.class_offset = self.fields[4]
|
||||
self.extra['Class offset in CTOC'] = 'class_offset'
|
||||
self.periodical_index = self.fields[5]
|
||||
self.extra['Periodical index'] = 'periodical_index'
|
||||
self.first_article_index = self.fields[6]
|
||||
self.last_article_index = self.fields[7]
|
||||
self.extra['First article index'] = 'first_article_index'
|
||||
self.extra['Last article index'] = 'last_article_index'
|
||||
self.extra_fields = self.fields[8:]
|
||||
vals.append(val)
|
||||
self.tags.append(Tag(tag, vals, self.entry_type, ctoc))
|
||||
|
||||
def __str__(self):
|
||||
ans = ['Index Entry(index=%s, entry_type=%s, sub_type=%s, length=%d)'%(
|
||||
self.index, self.entry_type, self.sub_type, len(self.raw))]
|
||||
ans.append('\tOffset in HTML: %d'%self.offset)
|
||||
ans.append('\tObject size in HTML: %d'%self.object_size)
|
||||
ans.append('\tLabel offset in CTOC: %d'%self.label_offset)
|
||||
ans.append('\tDepth: %d'%self.depth)
|
||||
for text, attr in self.extra.iteritems():
|
||||
ans.append('\t%s: %d'%(text, getattr(self, attr)))
|
||||
if self.extra_fields:
|
||||
ans.append('\tExtra Fields (%d): %r'%(len(self.extra_fields),
|
||||
self.extra_fields))
|
||||
|
||||
ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
|
||||
self.index, self.entry_type, len(self.tags))]
|
||||
for tag in self.tags:
|
||||
ans.append('\t'+str(tag))
|
||||
return '\n'.join(ans)
|
||||
|
||||
# }}}
|
||||
|
||||
class IndexRecord(object): # {{{
|
||||
|
||||
def __init__(self, record):
|
||||
def __init__(self, record, index_header, ctoc):
|
||||
self.record = record
|
||||
raw = self.record.raw
|
||||
if raw[:4] != b'INDX':
|
||||
@ -632,14 +651,12 @@ class IndexRecord(object): # {{{
|
||||
for i, off in enumerate(self.index_offsets):
|
||||
try:
|
||||
next_off = self.index_offsets[i+1]
|
||||
is_last = False
|
||||
except:
|
||||
next_off = len(indxt)
|
||||
is_last = True
|
||||
index, consumed = decode_hex_number(indxt[off:])
|
||||
entry_type, = u(b'>B', indxt[off+consumed])
|
||||
entry_type = ord(indxt[off+consumed])
|
||||
self.indices.append(IndexEntry(index, entry_type,
|
||||
indxt[off+consumed+1:next_off], is_last))
|
||||
indxt[off+consumed+1:next_off], ctoc, index_header.tagx_entries))
|
||||
|
||||
|
||||
def __str__(self):
|
||||
@ -679,6 +696,9 @@ class CTOC(object) : # {{{
|
||||
codec)
|
||||
pos += consumed+length
|
||||
|
||||
def __getitem__(self, offset):
|
||||
return self.records.get(offset)
|
||||
|
||||
def __str__(self):
|
||||
ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20]
|
||||
for k, v in self.records.iteritems():
|
||||
@ -723,7 +743,8 @@ class MOBIFile(object): # {{{
|
||||
self.ctoc = CTOC(self.records[
|
||||
pir+2:pir+2+self.index_header.num_of_ctoc_blocks],
|
||||
self.index_header.index_encoding)
|
||||
self.index_record = IndexRecord(self.records[pir+1])
|
||||
self.index_record = IndexRecord(self.records[pir+1],
|
||||
self.index_header, self.ctoc)
|
||||
|
||||
|
||||
def print_header(self, f=sys.stdout):
|
||||
|
Loading…
x
Reference in New Issue
Block a user