Mobi debug: Decompile CTOC and fix interpretation of index entries

This commit is contained in:
Kovid Goyal 2011-07-18 19:32:46 -06:00
parent ca2c41516a
commit 79ca569caa

View File

@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, datetime, sys, os
from collections import OrderedDict
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint
@ -509,10 +510,11 @@ class IndexEntry(object): # {{{
0x3f : 'article',
}
def __init__(self, ident, entry_type, raw):
def __init__(self, ident, entry_type, raw, is_last):
self.id = ident
self.fields = []
self.sub_type = None
self.raw = raw
try:
self.entry_type = self.TYPES[entry_type]
@ -522,19 +524,17 @@ class IndexEntry(object): # {{{
if self.entry_type in (0xdf, 0xff):
self.subtype = ord(raw[0])
raw = raw[1:]
while True:
while raw:
val, consumed = decint(raw)
raw = raw[consumed:]
if val == 0:
break
else:
self.fields.append(val)
if is_last and self.fields[-1] == 0:
self.fields = self.fields[:-1]
def __str__(self):
ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%(
self.id, self.entry_type, self.sub_type)]
ans.append('\tFields: %r'%self.fields)
ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%(
self.id, self.entry_type, self.sub_type, len(self.raw))]
ans.append('\tFields (%d): %r'%(len(self.fields), self.fields))
return '\n'.join(ans)
# }}}
@ -570,12 +570,17 @@ class IndexRecord(object): # {{{
indxt = raw[192:self.idxt_offset]
self.indices = []
for off in self.index_offsets:
index = indxt[off:]
ident, consumed = decode_hex_number(index)
index = index[consumed:]
entry_type, = u(b'>B', index[0])
self.indices.append(IndexEntry(ident, entry_type, index[1:]))
for i, off in enumerate(self.index_offsets):
try:
next_off = self.index_offsets[i+1]
is_last = False
except:
next_off = len(indxt)
is_last = True
ident, consumed = decode_hex_number(indxt[off:])
entry_type, = u(b'>B', indxt[off+consumed])
self.indices.append(IndexEntry(ident, entry_type,
indxt[off+consumed+1:next_off], is_last))
def __str__(self):
@ -601,6 +606,29 @@ class IndexRecord(object): # {{{
# }}}
class CTOC(object) : # {{{

    '''
    Decompiles the CTOC records into an ordered ``offset -> string`` mapping.

    Each record is a sequence of entries. An entry is a variable width
    integer (read with ``decint``) giving the length of the string, followed
    by that many bytes of string data in the index encoding.
    '''

    # NOTE(review): index entries appear to address strings in the second and
    # later CTOC records as record_number * 0x10000 + offset_in_record.
    # Confirm against the MOBI format documentation.
    RECORD_OFFSET_STEP = 0x10000

    def __init__(self, records, codec):
        '''
        :param records: Iterable of record objects exposing the bytes of a
                        CTOC record as ``.raw``
        :param codec: Name of the encoding used to decode the strings
        '''
        self.records = OrderedDict()
        record_offset = 0
        for record in records:
            raw = record.raw
            # Offsets are relative to the start of each record, so parsing
            # has to restart at 0 for every record. Carrying the position
            # over from the previous record would skip or misparse the data.
            pos = 0
            while pos < len(raw):
                length, consumed = decint(raw[pos:])
                if length > 0:
                    start = pos + consumed
                    self.records[record_offset + pos] = raw[
                            start:start+length].decode(codec)
                # Zero length entries still occupy `consumed` bytes, so this
                # always advances.
                pos += consumed + length
            record_offset += self.RECORD_OFFSET_STEP

    def __str__(self):
        '''Return a listing with one ``offset : string`` line per entry.'''
        ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records)+ '*'*20]
        for k, v in self.records.items():
            ans.append('%10d : %s'%(k, v))
        return '\n'.join(ans)

# }}}
class MOBIFile(object): # {{{
def __init__(self, stream):
@ -633,6 +661,9 @@ class MOBIFile(object): # {{{
pir = self.mobi_header.primary_index_record
if pir != 0xffffffff:
self.index_header = IndexHeader(self.records[pir])
self.ctoc = CTOC(self.records[
pir+2:pir+2+self.index_header.num_of_ctoc_blocks],
self.index_header.index_encoding)
self.index_record = IndexRecord(self.records[pir+1])
@ -660,6 +691,8 @@ def inspect_mobi(path_or_stream):
with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
print(str(f.index_header), file=out)
print('\n\n', file=out)
print(str(f.ctoc).encode('utf-8'), file=out)
print('\n\n', file=out)
print(str(f.index_record), file=out)
print ('Debug data saved to:', ddir)