Mobi debug: Decompile CTOC and fix interpretation of index entries

This commit is contained in:
Kovid Goyal 2011-07-18 19:32:46 -06:00
parent ca2c41516a
commit 79ca569caa

View File

@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct, datetime, sys, os import struct, datetime, sys, os
from collections import OrderedDict
from calibre.utils.date import utc_tz from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint
@ -509,10 +510,11 @@ class IndexEntry(object): # {{{
0x3f : 'article', 0x3f : 'article',
} }
def __init__(self, ident, entry_type, raw): def __init__(self, ident, entry_type, raw, is_last):
self.id = ident self.id = ident
self.fields = [] self.fields = []
self.sub_type = None self.sub_type = None
self.raw = raw
try: try:
self.entry_type = self.TYPES[entry_type] self.entry_type = self.TYPES[entry_type]
@ -522,19 +524,17 @@ class IndexEntry(object): # {{{
if self.entry_type in (0xdf, 0xff): if self.entry_type in (0xdf, 0xff):
self.subtype = ord(raw[0]) self.subtype = ord(raw[0])
raw = raw[1:] raw = raw[1:]
while True: while raw:
val, consumed = decint(raw) val, consumed = decint(raw)
raw = raw[consumed:] raw = raw[consumed:]
if val == 0:
break
else:
self.fields.append(val) self.fields.append(val)
if is_last and self.fields[-1] == 0:
self.fields = self.fields[:-1]
def __str__(self): def __str__(self):
ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s)'%( ans = ['Index Entry(id=%s, entry_type=%s, sub_type=%s, length=%d)'%(
self.id, self.entry_type, self.sub_type)] self.id, self.entry_type, self.sub_type, len(self.raw))]
ans.append('\tFields: %r'%self.fields) ans.append('\tFields (%d): %r'%(len(self.fields), self.fields))
return '\n'.join(ans) return '\n'.join(ans)
# }}} # }}}
@ -570,12 +570,17 @@ class IndexRecord(object): # {{{
indxt = raw[192:self.idxt_offset] indxt = raw[192:self.idxt_offset]
self.indices = [] self.indices = []
for off in self.index_offsets: for i, off in enumerate(self.index_offsets):
index = indxt[off:] try:
ident, consumed = decode_hex_number(index) next_off = self.index_offsets[i+1]
index = index[consumed:] is_last = False
entry_type, = u(b'>B', index[0]) except:
self.indices.append(IndexEntry(ident, entry_type, index[1:])) next_off = len(indxt)
is_last = True
ident, consumed = decode_hex_number(indxt[off:])
entry_type, = u(b'>B', indxt[off+consumed])
self.indices.append(IndexEntry(ident, entry_type,
indxt[off+consumed+1:next_off], is_last))
def __str__(self): def __str__(self):
@ -601,6 +606,29 @@ class IndexRecord(object): # {{{
# }}} # }}}
class CTOC(object):  # {{{

    '''
    Decompiled CTOC: every non-empty, length-prefixed string found in the
    CTOC records, stored in ``self.records`` as an ordered mapping of
    offset -> decoded string.

    Each string is stored in a record as a variable width integer (read with
    ``decint``) giving the byte length, followed by that many bytes of text
    in the index encoding. Zero length entries are skipped.
    '''

    # Offset added per CTOC record so that strings in the second and later
    # records get keys distinct from those in the first record.
    # NOTE(review): 0x10000 per record is the convention the MOBI index is
    # understood to use when referring to strings in later CTOC records --
    # confirm against the offsets found in the index entries.
    RECORD_STRIDE = 0x10000

    def __init__(self, records, codec):
        '''
        :param records: Iterable of record objects; only the ``raw``
                        attribute (the record's bytes) is read.
        :param codec:   Name of the encoding used to decode the strings.

        Raises whatever ``bytes.decode`` raises if a string is not valid in
        ``codec``.
        '''
        self.records = OrderedDict()
        for record_index, record in enumerate(records):
            raw = record.raw
            base = record_index * self.RECORD_STRIDE
            # The cursor must restart at the beginning of every record.
            # Carrying it over from the previous record would skip (or
            # mis-align) the start of this one.
            pos = 0
            while pos < len(raw):
                length, consumed = decint(raw[pos:])
                if length > 0:
                    start = pos + consumed
                    self.records[base + pos] = raw[
                            start:start + length].decode(codec)
                pos += consumed + length

    def __str__(self):
        '''Return a banner line followed by one ``offset : string`` line per
        decoded string, in the order the strings were read.'''
        ans = ['*'*20 + ' CTOC (%d strings) '%len(self.records) + '*'*20]
        # items() rather than iteritems(): same pairs in the same order, and
        # it does not tie this method to Python 2 only.
        for offset, text in self.records.items():
            ans.append('%10d : %s'%(offset, text))
        return '\n'.join(ans)
# }}}
class MOBIFile(object): # {{{ class MOBIFile(object): # {{{
def __init__(self, stream): def __init__(self, stream):
@ -633,6 +661,9 @@ class MOBIFile(object): # {{{
pir = self.mobi_header.primary_index_record pir = self.mobi_header.primary_index_record
if pir != 0xffffffff: if pir != 0xffffffff:
self.index_header = IndexHeader(self.records[pir]) self.index_header = IndexHeader(self.records[pir])
self.ctoc = CTOC(self.records[
pir+2:pir+2+self.index_header.num_of_ctoc_blocks],
self.index_header.index_encoding)
self.index_record = IndexRecord(self.records[pir+1]) self.index_record = IndexRecord(self.records[pir+1])
@ -660,6 +691,8 @@ def inspect_mobi(path_or_stream):
with open(os.path.join(ddir, 'index.txt'), 'wb') as out: with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
print(str(f.index_header), file=out) print(str(f.index_header), file=out)
print('\n\n', file=out) print('\n\n', file=out)
print(str(f.ctoc).encode('utf-8'), file=out)
print('\n\n', file=out)
print(str(f.index_record), file=out) print(str(f.index_record), file=out)
print ('Debug data saved to:', ddir) print ('Debug data saved to:', ddir)