MOBI debug: Lots of progress on decoding the TBS sequences for hierarchical periodicals

This commit is contained in:
Kovid Goyal 2011-07-21 21:02:11 -06:00
parent 7213f7b5a0
commit 35ce4aecae
2 changed files with 191 additions and 7 deletions

View File

@ -653,11 +653,28 @@ class IndexEntry(object): # {{{
return tag.value return tag.value
return -1 return -1
@property
def first_child_index(self):
    '''
    The value of the first tag whose ``attr`` is ``first_child_index``,
    or -1 when this entry has no such tag.
    '''
    candidates = (t.value for t in self.tags
            if t.attr == 'first_child_index')
    return next(candidates, -1)
@property
def last_child_index(self):
    '''
    The value of the first tag whose ``attr`` is ``last_child_index``,
    or -1 when this entry has no such tag.
    '''
    candidates = (t.value for t in self.tags
            if t.attr == 'last_child_index')
    return next(candidates, -1)
def __str__(self): def __str__(self):
ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%( ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
self.index, self.entry_type, len(self.tags))] self.index, self.entry_type, len(self.tags))]
for tag in self.tags: for tag in self.tags:
ans.append('\t'+str(tag)) ans.append('\t'+str(tag))
if self.first_child_index != -1:
ans.append('\tNumber of children: %d'%(self.last_child_index -
self.first_child_index + 1))
return '\n'.join(ans) return '\n'.join(ans)
# }}} # }}}
@ -832,8 +849,10 @@ class BinaryRecord(object): # {{{
class TBSIndexing(object): # {{{ class TBSIndexing(object): # {{{
def __init__(self, text_records, indices): def __init__(self, text_records, indices, doc_type):
self.record_indices = OrderedDict() self.record_indices = OrderedDict()
self.doc_type = doc_type
self.indices = indices
pos = 0 pos = 0
for r in text_records: for r in text_records:
start = pos start = pos
@ -856,6 +875,11 @@ class TBSIndexing(object): # {{{
if rec: if rec:
x[rec].append(entry) x[rec].append(entry)
def get_index(self, idx):
    '''
    Return the first entry in ``self.indices`` whose ``index`` attribute
    equals ``idx``.

    Raises IndexError if no entry matches.
    '''
    not_found = object()
    entry = next((e for e in self.indices if e.index == idx), not_found)
    if entry is not_found:
        raise IndexError('Index %d not found'%idx)
    return entry
def __str__(self): def __str__(self):
ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20] ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20]
for r, dat in self.record_indices.iteritems(): for r, dat in self.record_indices.iteritems():
@ -896,14 +920,17 @@ class TBSIndexing(object): # {{{
tbs_type = 0 tbs_type = 0
if len(byts): if len(byts):
outer, consumed = decint(bytes(byts)) outer, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
tbs_type = outer & 0b111 tbs_type = outer & 0b111
ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
ans.append('Outer Index entry: %d'%(outer >> 3)) ans.append('Outer Index entry: %d'%(outer >> 3))
arg1, consumed = decint(bytes(byts)) arg1, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
ans.append('Unknown: %d'%arg1) ans.append('Unknown: %d'%arg1)
if self.doc_type in (257, 259): # Hierarchical periodical
byts, a = self.interpret_periodical(tbs_type, byts)
ans += a
if byts: if byts:
sbyts = tuple(hex(b)[2:] for b in byts) sbyts = tuple(hex(b)[2:] for b in byts)
ans.append('Remaining bytes: %s'%' '.join(sbyts)) ans.append('Remaining bytes: %s'%' '.join(sbyts))
@ -911,6 +938,161 @@ class TBSIndexing(object): # {{{
ans.append('') ans.append('')
return tbs_type, ans return tbs_type, ans
def interpret_periodical(self, tbs_type, byts):
# Decode the periodical-specific portion of a TBS byte sequence into
# human-readable lines.
#
# tbs_type selects the layout to decode (3, 7, 2 and 6 are handled here).
# byts holds the bytes still to be decoded; it is consumed from the front
# as each field is read.
#
# Returns a tuple (remaining undecoded bytes, list of description strings).
# get_index() raises IndexError when a decoded section index is unknown.
#
# NOTE(review): the indentation of this block was lost in this rendering;
# confirm the nesting of the conditionals against the repository copy.
ans = []
if tbs_type == 3: # {{{
if byts:
# decint returns (value, number of bytes read)
arg2, consumed = decint(byts)
byts = byts[consumed:]
ans.append('Unknown: %d'%arg2)
if byts:
arg3, consumed = decint(byts)
byts = byts[consumed:]
# Upper bits are the section index, the low four bits are extra data
fsi = arg3 >> 4
extra = arg3 & 0b1111
ans.append('First section index: %d'%fsi)
psi = self.get_index(fsi)
ans.append('Extra bits: %d'%extra)
if byts:
# A next byte equal to the section index is reported as the end of that
# section; anything else is reported as the section starting.
if byts[0] == fsi:
ssi = psi.index+1
ans.append('First section ends')
byts = byts[1:]
arg, consumed = decint(byts)
raw = byts[:consumed]
byts = byts[consumed:]
flags = arg & 0b1111
ans.append('Unknown (art index at start of record?):'
' %d %r'%((arg>>4), raw))
ans.append('Flags: %d'%flags)
# The article count defaults to 1; an explicit count byte is read only
# when flags >= 4
num = 1
if flags >= 4:
num = byts[0]
byts = byts[1:]
ans.append('Number of articles in closing section: %d'%num)
if flags == 5:
# One extra value of unknown meaning; its raw bytes are echoed
arg, consumed = decint(byts)
ans.append('Unknown: %r'%bytes(byts[:consumed]))
byts = byts[consumed:]
arg, consumed = decint(byts)
byts = byts[consumed:]
off = arg >> 4
ans.append('Last article of ending section w.r.t. starting'
' section offset: %d [%d absolute]'%(off,
ssi+off))
ans.append('Extra bits: %d'%(arg & 0b1111))
arg, consumed = decint(byts)
byts = byts[consumed:]
off = arg >> 4
flag = arg & 0b1111
ans.append('Offset to first article of starting section: %d'
' [%d absolute]'%(off, ssi+off))
ans.append('Flags: %d'%flag)
# Again the count defaults to 1 unless the flag value is exactly 4
num = 1
if flag == 4:
num = byts[0]
byts = byts[1:]
ans.append('Number of articles in starting section: %d'%num)
else:
ans.append('First section starts')
off, consumed = decint(byts)
# Split the decoded value: low four bits are flags, the rest the offset
flags = off & 0b1111
off = off >> 4
byts = byts[consumed:]
ans.append('Article at start of block as offset from '
'parent index: %d [%d absolute]'%(off, psi.index+off))
ans.append('Flags: %d'%flags)
if flags == 4:
ans.append('Number of articles: %d'%byts[0])
byts = byts[1:]
# }}}
elif tbs_type == 7: # {{{
# This occurs for records that have no section nodes and
# whose parent section's index == 1
# The first two bytes are not understood and are echoed verbatim
ans.append('Unknown: %r'%bytes(byts[:2]))
byts = byts[2:]
arg, consumed = decint(byts)
byts = byts[consumed:]
ai = arg >> 4
flags = arg & 0b1111
num = 1
if flags == 4:
# flags == 4 means an article count byte must follow
if not byts:
raise ValueError('Type 7 TBS entry missing article count')
num = byts[0]
byts = byts[1:]
ans.append('Article at start of record: %d'%ai)
ans.append('Number of articles in record: %d'%num)
# }}}
elif tbs_type == 2: # {{{
# This occurs for records with no section nodes and whose parent
# section's index != 1 (undefined (records before the first
# section) or > 1)
# This is also used for records that are spanned by an article
# whose parent section index > 1. In this case the flags of the
# vwi referring to the article at the start
# of the record are set to 1 instead of 4.
if byts:
arg, consumed = decint(byts)
byts = byts[consumed:]
flags = (arg & 0b1111)
psi = (arg >> 4)
ans.append('Parent section index: %d'%psi)
# psi is rebound from the numeric index to the matching index entry
psi = self.get_index(psi)
ans.append('Flags: %d'%flags)
if flags == 1:
arg, consumed = decint(byts)
byts = byts[consumed:]
ans.append('Unknown: %d'%arg)
elif flags == 0:
arg, consumed = decint(byts)
byts = byts[consumed:]
# flags is overwritten here with the low four bits of the new value
flags = arg & 0b1111
off = arg >> 4
ans.append('Article at start of block as offset from '
'parent index: %d [%d absolute]'%(off, psi.index+off))
if flags == 4:
num = byts[0]
byts = byts[1:]
ans.append('Number of nodes: %d'%num)
elif flags == 1:
num = byts[0]
byts = byts[1:]
ans.append('EOF: %s'%hex(num))
else:
raise ValueError('Unknown flag value: %d'%flags)
# }}}
elif tbs_type == 6: # {{{
# This is used for records spanned by an article whose parent
# section's index == 1 or for the opening record if it contains the
# periodical start, section 1 start and at least one article. The
# two cases are distinguished by the flags on the article index
# vwi.
unk = byts[0]
byts = byts[1:]
ans.append('Unknown (always 2?): %d'%unk)
arg, consumed = decint(byts)
byts = byts[consumed:]
flags = (arg & 0b1111)
ai = (arg >> 4)
# The absolute index is reported as ai+1, i.e. relative to section 1
ans.append(('Article index at start of record or first article'
' index, relative to section 1: %d [%d absolute]'%(ai, ai+1)))
if flags == 1:
arg, consumed = decint(byts)
byts = byts[consumed:]
ans.append('EOF (should be 0): %d'%arg)
elif flags == 4:
num = byts[0]
byts = byts[1:]
ans.append('Number of article nodes in the record: %d'%num)
# }}}
# Whatever was not decoded is handed back to the caller with the report
return byts, ans
# }}} # }}}
class MOBIFile(object): # {{{ class MOBIFile(object): # {{{
@ -996,7 +1178,7 @@ class MOBIFile(object): # {{{
if self.index_record is not None: if self.index_record is not None:
self.tbs_indexing = TBSIndexing(self.text_records, self.tbs_indexing = TBSIndexing(self.text_records,
self.index_record.indices) self.index_record.indices, self.mobi_header.type_raw)
def print_header(self, f=sys.stdout): def print_header(self, f=sys.stdout):
print (str(self.palmdb).encode('utf-8'), file=f) print (str(self.palmdb).encode('utf-8'), file=f)

View File

@ -79,7 +79,7 @@ def encint(value, forward=True):
def decint(raw, forward=True): def decint(raw, forward=True):
''' '''
Read a variable width integer from the bytestring raw and return the Read a variable width integer from the bytestring or bytearray raw and return the
integer and the number of bytes read. If forward is True bytes are read integer and the number of bytes read. If forward is True bytes are read
from the start of raw, otherwise from the end of raw. from the start of raw, otherwise from the end of raw.
@ -88,8 +88,10 @@ def decint(raw, forward=True):
''' '''
val = 0 val = 0
byts = bytearray() byts = bytearray()
for byte in raw if forward else reversed(raw): src = bytearray(raw)
bnum = ord(byte) if not forward:
src.reverse()
for bnum in src:
byts.append(bnum & 0b01111111) byts.append(bnum & 0b01111111)
if bnum & 0b10000000: if bnum & 0b10000000:
break break