MOBI debug: Document all I've learned about TBS so far

This commit is contained in:
Kovid Goyal 2011-07-21 22:44:15 -06:00
parent 35ce4aecae
commit 8ba6341324
2 changed files with 232 additions and 56 deletions

View File

@ -927,7 +927,7 @@ class TBSIndexing(object): # {{{
ans.append('Outer Index entry: %d'%(outer >> 3)) ans.append('Outer Index entry: %d'%(outer >> 3))
arg1, consumed = decint(byts) arg1, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
ans.append('Unknown: %d'%arg1) ans.append('Unknown (vwi: always 0?): %d'%arg1)
if self.doc_type in (257, 259): # Hierarchical periodical if self.doc_type in (257, 259): # Hierarchical periodical
byts, a = self.interpret_periodical(tbs_type, byts) byts, a = self.interpret_periodical(tbs_type, byts)
ans += a ans += a
@ -940,6 +940,36 @@ class TBSIndexing(object): # {{{
def interpret_periodical(self, tbs_type, byts): def interpret_periodical(self, tbs_type, byts):
ans = [] ans = []
def tbs_type_6(byts, psi=None): # {{{
    '''
    Decode the article related fields that follow the header of a TBS
    entry, appending a human readable line for every decoded field to
    the ``ans`` list of the enclosing function.

    :param byts: The TBS bytes that remain to be decoded. May be empty,
                 in which case nothing is decoded.
    :param psi: The index entry of the parent section (its ``index``
                attribute is used to compute the absolute article
                index). Defaults to ``self.get_index(1)``.
    :return: The bytes left over after decoding.
    :raises ValueError: If the flags of the article index fvwi are not
                        one of 0, 1 or 4, or if the flags are 4 and the
                        article count byte is missing.
    '''
    if psi is None:
        # Assume parent section is 1
        psi = self.get_index(1)
    if byts:
        # byts could be empty
        arg, consumed = decint(byts)
        byts = byts[consumed:]
        # The lowest four bits of the fvwi are flags, the remaining
        # bits are the article index relative to the parent section
        flags = (arg & 0b1111)
        ai = (arg >> 4)
        ans.append(('Article index at start of record or first article'
            ' index, relative to parent section (fvwi): %d [%d absolute]'%(ai,
                ai+psi.index)))
        if flags == 1:
            arg, consumed = decint(byts)
            byts = byts[consumed:]
            ans.append('EOF (vwi: should be 0): %d'%arg)
        elif flags == 4:
            # Fail with a descriptive error instead of an IndexError
            # when the entry is truncated, as is done for type 7 entries
            if not byts:
                raise ValueError('TBS entry missing article count byte')
            num = byts[0]
            byts = byts[1:]
            ans.append('Number of article nodes in the record (byte): %d'%num)
        elif flags == 0:
            # Nothing follows the article index
            pass
        else:
            raise ValueError('Unknown flags: %d'%flags)
    return byts
# }}}
if tbs_type == 3: # {{{ if tbs_type == 3: # {{{
if byts: if byts:
arg2, consumed = decint(byts) arg2, consumed = decint(byts)
@ -1010,20 +1040,37 @@ class TBSIndexing(object): # {{{
elif tbs_type == 7: # {{{ elif tbs_type == 7: # {{{
# This occurs for records that have no section nodes and # This occurs for records that have no section nodes and
# whose parent section's index == 1 # whose parent section's index == 1
ans.append('Unknown: %r'%bytes(byts[:2])) ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
byts = byts[2:] byts = byts[2:]
arg, consumed = decint(byts) arg, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
ai = arg >> 4 ai = arg >> 4
flags = arg & 0b1111 flags = arg & 0b1111
num = 1 ans.append('Article at start of record (fvwi): %d'%ai)
if flags == 4: if flags == 4:
if not byts:
raise ValueError('Type 7 TBS entry missing article count')
num = byts[0] num = byts[0]
byts = byts[1:] byts = byts[1:]
ans.append('Article at start of record: %d'%ai) ans.append('Number of articles in record (byte): %d'%num)
ans.append('Number of articles in record: %d'%num) elif flags == 0:
pass
elif flags == 1:
arg, consumed = decint(byts)
byts = byts[consumed:]
ans.append('EOF (vwi: should be 0): %d'%arg)
else:
raise ValueError('Unknown flags value: %d'%flags)
# }}}
elif tbs_type == 6: # {{{
# This is used for records spanned by an article whose parent
# section's index == 1 or for the opening record if it contains the
# periodical start, section 1 start and at least one article. The
# two cases are distinguished by the flags on the article index
# vwi.
unk = byts[0]
byts = byts[1:]
ans.append('Unknown (byte: always 2?): %d'%unk)
byts = tbs_type_6(byts)
# }}} # }}}
elif tbs_type == 2: # {{{ elif tbs_type == 2: # {{{
@ -1034,61 +1081,22 @@ class TBSIndexing(object): # {{{
# whose parent section index > 1. In this case the flags of the # whose parent section index > 1. In this case the flags of the
# vwi referring to the article at the start # vwi referring to the article at the start
# of the record are set to 1 instead of 4. # of the record are set to 1 instead of 4.
if byts:
arg, consumed = decint(byts)
byts = byts[consumed:]
flags = (arg & 0b1111)
psi = (arg >> 4)
ans.append('Parent section index: %d'%psi)
psi = self.get_index(psi)
ans.append('Flags: %d'%flags)
if flags == 1:
arg, consumed = decint(byts)
byts = byts[consumed:]
ans.append('Unknown: %d'%arg)
elif flags == 0:
arg, consumed = decint(byts)
byts = byts[consumed:]
flags = arg & 0b1111
off = arg >> 4
ans.append('Article at start of block as offset from '
'parent index: %d [%d absolute]'%(off, psi.index+off))
if flags == 4:
num = byts[0]
byts = byts[1:]
ans.append('Number of nodes: %d'%num)
elif flags == 1:
num = byts[0]
byts = byts[1:]
ans.append('EOF: %s'%hex(num))
else:
raise ValueError('Unknown flag value: %d'%flags)
# }}}
elif tbs_type == 6: # {{{
# This is used for records spanned by an article whose parent
# section's index == 1 or for the opening record if it contains the
# periodical start, section 1 start and atleast one article. The
# two cases are distinguidshed by the flags on the article index
# vwi.
unk = byts[0]
byts = byts[1:]
ans.append('Unknown (always 2?): %d'%unk)
arg, consumed = decint(byts) arg, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
flags = (arg & 0b1111) flags = (arg & 0b1111)
ai = (arg >> 4) psi = (arg >> 4)
ans.append(('Article index at start of record or first article' ans.append('Parent section index (fvwi): %d'%psi)
' index, relative to section 1: %d [%d absolute]'%(ai, ai+1))) psi = self.get_index(psi)
ans.append('Flags: %d'%flags)
if flags == 1: if flags == 1:
arg, consumed = decint(byts) arg, consumed = decint(byts)
byts = byts[consumed:] byts = byts[consumed:]
ans.append('EOF (should be 0): %d'%arg) ans.append('Unknown (vwi?: always 0?): %d'%arg)
elif flags == 4: byts = tbs_type_6(byts, psi=psi)
num = byts[0] elif flags == 0:
byts = byts[1:] byts = tbs_type_6(byts, psi=psi)
ans.append('Number of article nodes in the record: %d'%num) else:
raise ValueError('Unkown flags: %d'%flags)
# }}} # }}}
return byts, ans return byts, ans

View File

@ -0,0 +1,168 @@
Reverse engineering the trailing byte sequences for hierarchical periodicals
===============================================================================
In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as flags.
Opening record
----------------
The text record that contains the opening node for the periodical (depth=0 node in the NCX) can have TBS of 3 different forms:
1. If it has only the periodical node and no section/article nodes, TBS of type 2, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 1 index entries (0 ends, 0 complete, 1 starts)
TBS bytes: 82 80
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
2. If it has a periodical node and a section node, but no article nodes, TBS of type 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 2 index entries (0 ends, 0 complete, 2 starts)
TBS bytes: 86 80 2
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 93254) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 49280) [Ars Technica]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
3. If it has both the section 1 node and at least one article node, TBS of type 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 4 index entries (0 ends, 1 complete, 3 starts)
TBS bytes: 86 80 2 c4 2
Complete:
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 549, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz]
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 79253) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 35279) [Ars Technica]
Index Entry: 6 (Parent index: 1, Depth: 2, Offset: 2415, Size: 2764) [Week in Apple: ZFS on Mac OS X, rogue tethering, DUI apps, and more]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
Number of article nodes in the record (byte): 2
If there was only a single article, instead of 2, then the last two bytes would be replaced by a single byte: c0, i.e. there would be no byte giving the number of articles in the record.
Records with no nodes
------------------------
These records are spanned by a single article. They are of two types:
1. If the parent section index is 1, TBS type of 6, like this::
Record #4: Starts at: 12288 Ends at: 16383
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 86 80 2 c1 80
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
EOF (vwi: should be 0): 0
If the record is before the first article, the TBS bytes would be: 86 80 2
2. If the parent section index is > 1, TBS type of 2, like this::
Record #14: Starts at: 53248 Ends at: 57343
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 82 80 a0 1 e1 80
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 2
Flags: 0
Article index at start of record or first article index, relative to parent section (fvwi): 14 [16 absolute]
EOF (vwi: should be 0): 0
Records with only article nodes
-----------------------------------
Such records have no section transitions (i.e. a section end/section start pair). They have only one or more article nodes. They are of two types:
1. If the parent section index is 1, TBS type of 7, like this::
Record #6: Starts at: 20480 Ends at: 24575
Contains: 2 index entries (1 ends, 0 complete, 1 starts)
TBS bytes: 87 80 2 80 1 84 2
Ends:
Index Entry: 9 (Parent index: 1, Depth: 2, Offset: 16453, Size: 4199) [Vaccine's success spurs whooping cough comeback]
Starts:
Index Entry: 10 (Parent index: 1, Depth: 2, Offset: 20652, Size: 4246) [Apple's mobile products do not violate Nokia patents, says ITC]
TBS Type: 111 (7)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown: '\x02\x80' (vwi?: Always 256)
Article at start of record (fvwi): 8
Number of articles in record (byte): 2
If there was only one article in the record, the last two bytes would be replaced by a single byte: 80
If this record is the first record with an article, then the article at the start of the record should be the last section index. At least, that's what kindlegen does, though if you ask me, it should be the first section index.
2. If the parent section index is > 1, TBS type of 2, like this::
Record #16: Starts at: 61440 Ends at: 65535
Contains: 5 index entries (1 ends, 3 complete, 1 starts)
TBS bytes: 82 80 a1 80 1 f4 5
Ends:
Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 60920, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
Complete:
Index Entry: 18 (Parent index: 2, Depth: 2, Offset: 62002, Size: 1016) [Rumour: OS X Lion nearing Golden Master stage]
Index Entry: 19 (Parent index: 2, Depth: 2, Offset: 63018, Size: 1045) [iOS 4.3.1 released]
Index Entry: 20 (Parent index: 2, Depth: 2, Offset: 64063, Size: 972) [Windows 8 'system reset' image leaks]
Starts:
Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 65035, Size: 1057) [Windows Phone 7: Why it's failing]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 2
Flags: 1
Unknown (vwi: always 0?): 0
Article index at start of record or first article index, relative to parent section (fvwi): 15 [17 absolute]
Number of article nodes in the record (byte): 5
If there was only one article in the record, the last two bytes would be replaced by a single byte: f0
Records with a section transition
-----------------------------------
In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting.
TODO: Test both the case of a single section transition and the case of multiple section transitions in one record.
Ending record
----------------
Logically, ending records must have at least one article ending, one section ending and the periodical ending. They are of TBS type 2, like this::
Record #17: Starts at: 65536 Ends at: 68684
Contains: 4 index entries (3 ends, 1 complete, 0 starts)
TBS bytes: 82 80 c0 4 f4 2
Ends:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 51234, Size: 17451) [Slashdot]
Index Entry: 43 (Parent index: 4, Depth: 2, Offset: 65422, Size: 1717) [US ITC May Reverse Judge's Ruling In Kodak vs. Apple]
Complete:
Index Entry: 44 (Parent index: 4, Depth: 2, Offset: 67139, Size: 1546) [Google Starts Testing Google Music Internally]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 4
Flags: 0
Article index at start of record or first article index, relative to parent section (fvwi): 39 [43 absolute]
Number of article nodes in the record (byte): 2
If the record had only a single article end, the last two bytes would be replaced with: f0