From 8ba6341324ad9c95d6cf94a6e3c33e7b47449886 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jul 2011 22:44:15 -0600 Subject: [PATCH] MOBI debug: Document all I've learned about TBS so far --- src/calibre/ebooks/mobi/debug.py | 120 +++++++------- src/calibre/ebooks/mobi/tbs_periodicals.rst | 168 ++++++++++++++++++++ 2 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 src/calibre/ebooks/mobi/tbs_periodicals.rst diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index bfd6a20c07..79f2c3483b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -927,7 +927,7 @@ class TBSIndexing(object): # {{{ ans.append('Outer Index entry: %d'%(outer >> 3)) arg1, consumed = decint(byts) byts = byts[consumed:] - ans.append('Unknown: %d'%arg1) + ans.append('Unknown (vwi: always 0?): %d'%arg1) if self.doc_type in (257, 259): # Hierarchical periodical byts, a = self.interpret_periodical(tbs_type, byts) ans += a @@ -940,6 +940,36 @@ class TBSIndexing(object): # {{{ def interpret_periodical(self, tbs_type, byts): ans = [] + + def tbs_type_6(byts, psi=None): # {{{ + if psi is None: + # Assume parent section is 1 + psi = self.get_index(1) + if byts: + # byts could be empty + arg, consumed = decint(byts) + byts = byts[consumed:] + flags = (arg & 0b1111) + ai = (arg >> 4) + ans.append(('Article index at start of record or first article' + ' index, relative to parent section (fvwi): %d [%d absolute]'%(ai, + ai+psi.index))) + if flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('EOF (vwi: should be 0): %d'%arg) + elif flags == 4: + num = byts[0] + byts = byts[1:] + ans.append('Number of article nodes in the record (byte): %d'%num) + elif flags == 0: + pass + else: + raise ValueError('Unknown flags: %d'%flags) + return byts + + # }}} + if tbs_type == 3: # {{{ if byts: arg2, consumed = decint(byts) @@ -1010,20 +1040,37 @@ class TBSIndexing(object): # {{{ elif tbs_type == 7: # 
{{{ # This occurs for records that have no section nodes and # whose parent section's index == 1 - ans.append('Unknown: %r'%bytes(byts[:2])) + ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2])) byts = byts[2:] arg, consumed = decint(byts) byts = byts[consumed:] ai = arg >> 4 flags = arg & 0b1111 - num = 1 + ans.append('Article at start of record (fvwi): %d'%ai) if flags == 4: - if not byts: - raise ValueError('Type 7 TBS entry missing article count') num = byts[0] byts = byts[1:] - ans.append('Article at start of record: %d'%ai) - ans.append('Number of articles in record: %d'%num) + ans.append('Number of articles in record (byte): %d'%num) + elif flags == 0: + pass + elif flags == 1: + arg, consumed = decint(byts) + byts = byts[consumed:] + ans.append('EOF (vwi: should be 0): %d'%arg) + else: + raise ValueError('Unknown flags value: %d'%flags) + # }}} + + elif tbs_type == 6: # {{{ + # This is used for records spanned by an article whose parent + # section's index == 1 or for the opening record if it contains the + # periodical start, section 1 start and at least one article. The + # two cases are distinguished by the flags on the article index + # vwi. + unk = byts[0] + byts = byts[1:] + ans.append('Unknown (byte: always 2?): %d'%unk) + byts = tbs_type_6(byts) # }}} elif tbs_type == 2: # {{{ @@ -1034,61 +1081,22 @@ class TBSIndexing(object): # {{{ # whose parent section index > 1. In this case the flags of the # vwi referring to the article at the start # of the record are set to 1 instead of 4. 
- if byts: - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - psi = (arg >> 4) - ans.append('Parent section index: %d'%psi) - psi = self.get_index(psi) - ans.append('Flags: %d'%flags) - if flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown: %d'%arg) - elif flags == 0: - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = arg & 0b1111 - off = arg >> 4 - ans.append('Article at start of block as offset from ' - 'parent index: %d [%d absolute]'%(off, psi.index+off)) - if flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of nodes: %d'%num) - elif flags == 1: - num = byts[0] - byts = byts[1:] - ans.append('EOF: %s'%hex(num)) - else: - raise ValueError('Unknown flag value: %d'%flags) - # }}} - - elif tbs_type == 6: # {{{ - # This is used for records spanned by an article whose parent - # section's index == 1 or for the opening record if it contains the - # periodical start, section 1 start and atleast one article. The - # two cases are distinguidshed by the flags on the article index - # vwi. 
- unk = byts[0] - byts = byts[1:] - ans.append('Unknown (always 2?): %d'%unk) arg, consumed = decint(byts) byts = byts[consumed:] flags = (arg & 0b1111) - ai = (arg >> 4) - ans.append(('Article index at start of record or first article' - ' index, relative to section 1: %d [%d absolute]'%(ai, ai+1))) + psi = (arg >> 4) + ans.append('Parent section index (fvwi): %d'%psi) + psi = self.get_index(psi) + ans.append('Flags: %d'%flags) if flags == 1: arg, consumed = decint(byts) byts = byts[consumed:] - ans.append('EOF (should be 0): %d'%arg) - elif flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of article nodes in the record: %d'%num) - + ans.append('Unknown (vwi?: always 0?): %d'%arg) + byts = tbs_type_6(byts, psi=psi) + elif flags == 0: + byts = tbs_type_6(byts, psi=psi) + else: + raise ValueError('Unkown flags: %d'%flags) # }}} return byts, ans diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst new file mode 100644 index 0000000000..4dbae3f295 --- /dev/null +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -0,0 +1,168 @@ +Reverse engineering the trailing byte sequences for hierarchical periodicals +=============================================================================== + +In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. + +Opening record +---------------- + +The text record that contains the opening node for the periodical (depth=0 node in the NCX) can have TBS of 3 different forms: + + 1. If it has only the periodical node and no section/article nodes, TBS of type 2, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 1 index entries (0 ends, 0 complete, 1 starts) + TBS bytes: 82 80 + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + + 2. 
If it has a periodical and a section node, but no article nodes, TBS of type 6, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 2 index entries (0 ends, 0 complete, 2 starts) + TBS bytes: 86 80 2 + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 93254) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 49280) [Ars Technica] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + + 3. If it has both the section 1 node and at least one article node, TBS of type 6, like this:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 4 index entries (0 ends, 1 complete, 3 starts) + TBS bytes: 86 80 2 c4 2 + Complete: + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 549, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 79253) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 35279) [Ars Technica] + Index Entry: 6 (Parent index: 1, Depth: 2, Offset: 2415, Size: 2764) [Week in Apple: ZFS on Mac OS X, rogue tethering, DUI apps, and more] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Number of article nodes in the record (byte): 2 + + If there was only a single article, instead of 2, then the last two bytes would be replaced by a single byte: c0, i.e. there would be no byte giving the number of articles in the record. + + +Records with no nodes +------------------------ + +These records are spanned by a single article. They are of two types: + + 1. 
If the parent section index is 1, TBS type of 6, like this:: + + Record #4: Starts at: 12288 Ends at: 16383 + Contains: 0 index entries (0 ends, 0 complete, 0 starts) + TBS bytes: 86 80 2 c1 80 + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + EOF (vwi: should be 0): 0 + + If the record is before the first article, the TBS bytes would be: 86 80 2 + + 2. If the parent section index is > 1, TBS type of 2, like this:: + + Record #14: Starts at: 53248 Ends at: 57343 + Contains: 0 index entries (0 ends, 0 complete, 0 starts) + TBS bytes: 82 80 a0 1 e1 80 + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi): 2 + Flags: 0 + Article index at start of record or first article index, relative to parent section (fvwi): 14 [16 absolute] + EOF (vwi: should be 0): 0 + +Records with only article nodes +----------------------------------- + +Such records have no section transitions (i.e. a section end/section start pair). They have only one or more article nodes. They are of two types: + + 1. 
If the parent section index is 1, TBS type of 7, like this:: + + Record #6: Starts at: 20480 Ends at: 24575 + Contains: 2 index entries (1 ends, 0 complete, 1 starts) + TBS bytes: 87 80 2 80 1 84 2 + Ends: + Index Entry: 9 (Parent index: 1, Depth: 2, Offset: 16453, Size: 4199) [Vaccine's success spurs whooping cough comeback] + Starts: + Index Entry: 10 (Parent index: 1, Depth: 2, Offset: 20652, Size: 4246) [Apple's mobile products do not violate Nokia patents, says ITC] + TBS Type: 111 (7) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown: '\x02\x80' (vwi?: Always 256) + Article at start of record (fvwi): 8 + Number of articles in record (byte): 2 + + If there was only one article in the record, the last two bytes would be replaced by a single byte: 80 + + If this record is the first record with an article, then the article at the start of the record should be the last section index. At least, that's what kindlegen does, though if you ask me, it should be the first section index. + + + 2. 
If the parent section index is > 1, TBS type of 2, like this:: + + Record #16: Starts at: 61440 Ends at: 65535 + Contains: 5 index entries (1 ends, 3 complete, 1 starts) + TBS bytes: 82 80 a1 80 1 f4 5 + Ends: + Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 60920, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware] + Complete: + Index Entry: 18 (Parent index: 2, Depth: 2, Offset: 62002, Size: 1016) [Rumour: OS X Lion nearing Golden Master stage] + Index Entry: 19 (Parent index: 2, Depth: 2, Offset: 63018, Size: 1045) [iOS 4.3.1 released] + Index Entry: 20 (Parent index: 2, Depth: 2, Offset: 64063, Size: 972) [Windows 8 'system reset' image leaks] + Starts: + Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 65035, Size: 1057) [Windows Phone 7: Why it's failing] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi): 2 + Flags: 1 + Unknown (vwi: always 0?): 0 + Article index at start of record or first article index, relative to parent section (fvwi): 15 [17 absolute] + Number of article nodes in the record (byte): 5 + + If there was only one article in the record, the last two bytes would be replaced by a single byte: f0 + +Records with a section transition +----------------------------------- + +In such a record there is a transition from one section to the next. As such, the record must have at least one article ending and one article starting. + +TODO: Test both the case of a single transition and the case of multiple transitions + +Ending record +---------------- + +Logically, ending records must have at least one article ending, one section ending and the periodical ending. 
They are of TBS type 2, like this:: + + Record #17: Starts at: 65536 Ends at: 68684 + Contains: 4 index entries (3 ends, 1 complete, 0 starts) + TBS bytes: 82 80 c0 4 f4 2 + Ends: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader] + Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 51234, Size: 17451) [Slashdot] + Index Entry: 43 (Parent index: 4, Depth: 2, Offset: 65422, Size: 1717) [US ITC May Reverse Judge's Ruling In Kodak vs. Apple] + Complete: + Index Entry: 44 (Parent index: 4, Depth: 2, Offset: 67139, Size: 1546) [Google Starts Testing Google Music Internally] + TBS Type: 010 (2) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Parent section index (fvwi): 4 + Flags: 0 + Article at start of block as offset from parent index (fvwi): 39 [43 absolute] + Number of nodes (byte): 2 + +If the record had only a single article end, the last two bytes would be replaced with: f0 +