KF8 Output: Proper algorithm for calculating trailing bytes for all types of ToCs

Kovid Goyal 2012-04-28 22:17:42 +05:30
parent f4e6d943ee
commit 89e01414c5
5 changed files with 206 additions and 106 deletions

View File

@ -549,6 +549,9 @@ class TextRecord(object): # {{{
raw = '%s : %r\n\n'%(k, v)
f.write(raw.encode('utf-8'))
def __len__(self):
return len(self.raw)
# }}}

View File

@ -158,9 +158,12 @@ class NCXIndex(Index):
self.records = []
if self.table is not None:
NCXEntry = namedtuple('NCXEntry', 'index start length depth parent '
'first_child last_child title pos_fid')
for num, x in enumerate(self.table.iteritems()):
text, tag_map = x
entry = default_entry.copy()
entry = e = default_entry.copy()
entry['name'] = text
entry['num'] = num
@ -179,7 +182,17 @@ class NCXIndex(Index):
if tag == which:
entry[name] = self.cncx.get(fieldvalue,
default_entry[name])
def refindx(e, name):
ans = e[name]
if ans < 0:
ans = None
return ans
entry = NCXEntry(start=e['pos'], index=e['num'],
length=e['len'], depth=e['hlvl'], parent=refindx(e,
'parent'), first_child=refindx(e, 'child1'),
last_child=refindx(e, 'childn'), title=e['text'],
pos_fid=e['pos_fid'])
self.records.append(entry)

View File

@ -2,19 +2,20 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from future_builtins import map
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, imghdr, struct
import sys, os, imghdr, struct, textwrap
from itertools import izip
from calibre import CurrentDir
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
GuideIndex)
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@ -185,38 +186,42 @@ class MOBIFile(object):
payload))
def read_tbs(self):
from calibre.ebooks.mobi.writer8.tbs import (Entry,
collect_indexing_data)
from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
collect_indexing_data, encode_strands_as_sequences,
sequences_to_bytes)
entry_map = []
for index in self.ncx_index:
enders = [e['pos'] for e in self.ncx_index if e['pos'] >
index['pos'] and
e['hlvl'] <= index['hlvl']]
end = min(enders+[len(self.raw_text)])
vals = list(index)[:-1] + [None, None, None, None]
entry_map.append(Entry(*vals))
entry_map.append(Entry(index=index['num'], title=index['text'],
depth=index['hlvl'],
parent=index['parent'] if index['parent'] > -1 else None,
first_child=index['child1'] if index['child1'] > -1 else None,
last_child=index['childn'] if index['childn'] > -1 else None,
start=index['pos'], length=end-index['pos']))
indexing_data = collect_indexing_data(entry_map,
len(self.text_records))
self.indexing_data = []
for i, data in enumerate(indexing_data):
indexing_data = collect_indexing_data(entry_map, list(map(len,
self.text_records)))
self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
Index Entry lines are of the form:
depth:index_number [action] parent (index_num-parent) Geometry
Where Geometry is the start and end of the index entry w.r.t
the start of the text record.
''')]
for i, strands in enumerate(indexing_data):
rec = self.text_records[i]
tbs_bytes = rec.trailing_data.get('indexing', b'')
desc = ['Record #%d'%i]
for x in ('starts', 'completes', 'ends', 'spans'):
points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
getattr(data, x)]
if points:
desc.append(x+':')
desc.extend(points)
for s, strand in enumerate(strands):
desc.append('Strand %d'%s)
for entries in strand.itervalues():
for e in entries:
desc.append(
' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
e.depth * (' ') + '- ', e.index, e.action, e.parent,
e.index-(e.parent or 0), e.start-i*RECORD_SIZE,
e.start+e.length-i*RECORD_SIZE))
desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
flag_sz = 3
sequences = []
otbs = tbs_bytes
while tbs_bytes:
try:
val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
@ -226,10 +231,16 @@ class MOBIFile(object):
tbs_bytes = tbs_bytes[consumed:]
extra = {bin(k):v for k, v in extra.iteritems()}
sequences.append((val, extra))
for i, seq in enumerate(sequences):
desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
for j, seq in enumerate(sequences):
desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
if tbs_bytes:
desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
calculated_sequences = encode_strands_as_sequences(strands)
calculated_bytes = sequences_to_bytes(calculated_sequences)
if calculated_bytes != otbs:
print('WARNING: TBS mismatch for record %d'%i)
desc.append('WARNING: TBS mismatch!')
desc.append('Calculated sequences: %r'%calculated_sequences)
desc.append('')
self.indexing_data.append('\n'.join(desc))
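As a rough illustration with made-up values: an entry at depth 1, index 5, action 'starts', parent 2, beginning 3500 bytes into the record and running 2700 bytes past that point would be rendered by the format string above as:

  - 5 [starts   ] parent: 2 (3) Geometry: (3500, 6200)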
@ -242,8 +253,7 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of:
of.write(f.raw_text)
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
'tbs'):
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'):
os.mkdir(os.path.join(ddir, x))
for rec in f.text_records:
@ -269,7 +279,7 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
fo.write(str(f.guide_index).encode('utf-8'))
with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
for part in f.files:

View File

@ -264,12 +264,14 @@ class KF8Writer(object):
text = BytesIO(text)
nrecords = 0
records_size = 0
self.uncompressed_record_lengths = []
if self.compress:
self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length:
data, overlap = create_text_record(text)
self.uncompressed_record_lengths.append(len(data))
if self.compress:
data = compress_doc(data)
@ -372,7 +374,7 @@ class KF8Writer(object):
entry['length'] = get_next_start(entry) - entry['offset']
self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
self.last_text_record_idx+1)
self.uncompressed_record_lengths)
self.ncx_records = NCXIndex(entries)()
def create_guide(self):

View File

@ -7,103 +7,175 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import namedtuple
from functools import partial
DOC = '''
Trailing Byte Sequences contain information about which index entries touch a
particular text record. Every text record has a set of trailing byte
sequences. In order to figure out the sequence for a given text record, you
have to first calculate all the indices that start, end, span and are
contained within that text record. Then arrange the indices into 'strands',
where each strand is a hierarchical progression from the top level index down.
For the exact algorithm, see separate_strands(). The strands are then encoded
into 'sequences', see encode_strands_as_sequences() and finally the sequences
are turned into bytes.
'''
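A minimal sketch of this pipeline on hypothetical data: a flat ToC with two top-level entries spread over two 4096-byte text records. The last four Entry fields are computed later by fill_entry(), so they are passed as None here.

entries = [
    Entry(0, 0, 3000, 0, None, None, None, 'one', None, None, None, None),
    Entry(1, 3000, 5192, 0, None, None, None, 'two', None, None, None, None),
]
data = collect_indexing_data(entries, [4096, 4096])
for strands in data:
    tbs_bytes = sequences_to_bytes(encode_strands_as_sequences(strands))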
from collections import namedtuple, OrderedDict
from operator import attrgetter
from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
from calibre.ebooks.mobi.utils import (encode_trailing_data,
encode_tbs)
Entry = namedtuple('IndexEntry', 'index start length depth parent '
'first_child last_child title')
Data = namedtuple('Data', 'starts ends completes spans')
'first_child last_child title action start_offset length_offset '
'text_record_length')
def collect_indexing_data(entries, number_of_text_records):
def fill_entry(entry, start_offset, text_record_length):
length_offset = start_offset + entry.length
if start_offset < 0:
action = 'spans' if length_offset > text_record_length else 'ends'
else:
action = 'starts' if length_offset > text_record_length else 'completes'
return Entry(*(entry[:-4] + (action, start_offset, length_offset,
text_record_length)))
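In other words, for a record of length L (a restatement of the branches above):

start_offset >= 0 and length_offset <= L  ->  'completes'
start_offset >= 0 and length_offset >  L  ->  'starts'
start_offset <  0 and length_offset <= L  ->  'ends'
start_offset <  0 and length_offset >  L  ->  'spans'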
def populate_strand(parent, entries):
ans = [parent]
children = [c for c in entries if c.parent == parent.index]
if children:
# Add first child to this strand, and recurse downwards
child = children[0]
entries.remove(child)
ans += populate_strand(child, entries)
else:
# Add any entries at the same depth that form a contiguous set of
# indices and belong to the same parent (these can all be
# represented as a single sequence with the 0b100 flag)
current_index = parent.index
siblings = []
for entry in list(entries):
if (entry.depth == parent.depth and entry.parent == parent.parent
and entry.index == current_index+1):
current_index += 1
entries.remove(entry)
children = [c for c in entries if c.parent == entry.index]
if children:
siblings += populate_strand(entry, entries)
break # Cannot add more siblings, as we have added children
else:
siblings.append(entry)
ans += siblings
return ans
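A sketch of how a strand forms, with hypothetical entries (parent A at depth 0, children B and C at depth 1):

A = Entry(0, 0, 9000, 0, None, 1, 2, 'A', None, None, None, None)
B = Entry(1, 0, 4000, 1, 0, None, None, 'B', None, None, None, None)
C = Entry(2, 4000, 5000, 1, 0, None, None, 'C', None, None, None, None)
rest = [B, C]
populate_strand(A, rest)  # returns [A, B, C]; rest is emptied in place

populate_strand() descends into the first child B; B has no children of its own, so its contiguous sibling C (same depth, same parent, consecutive index) is absorbed into the same strand.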
def separate_strands(entries):
ans = []
while entries:
top, entries = entries[0], entries[1:]
strand = populate_strand(top, entries)
layers = OrderedDict()
for entry in strand:
if entry.depth not in layers:
layers[entry.depth] = []
layers[entry.depth].append(entry)
ans.append(layers)
return ans
def collect_indexing_data(entries, text_record_lengths):
''' For every text record calculate which index entries start, end, span or
are contained within that record.'''
are contained within that record. Arrange these entries in 'strands'. '''
data = []
for i in xrange(number_of_text_records):
record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE
datum = Data([], [], [], [])
data.append(datum)
entries = sorted(entries, key=attrgetter('start'))
record_start = 0
for rec_length in text_record_lengths:
next_record_start = record_start + rec_length
local_entries = []
for entry in entries:
end = entry.start + entry.length - 1
if (entry.start >= next_record_start or end < record_start):
# This entry does not have any overlap with this record
if entry.start >= next_record_start:
# No more entries overlap this record
break
if entry.start + entry.length <= record_start:
# This entry does not touch this record
continue
if (entry.start < record_start and end >= next_record_start):
# This entry spans this record
datum.spans.append(entry)
continue
if (entry.start >= record_start and end < next_record_start):
# This entry is contained in this record
datum.completes.append(entry)
if (entry.start >= record_start and end >= next_record_start):
# This entry starts in this record
datum.starts.append(entry)
continue
if (entry.start < record_start and end < next_record_start):
# This entry ends in this record
datum.ends.append(entry)
local_entries.append(fill_entry(entry, entry.start - record_start,
rec_length))
for x in datum:
# Should be unnecessary as entries are already in this order, but
# best to be safe.
x.sort(key=lambda x:x.depth)
strands = separate_strands(local_entries)
data.append(strands)
record_start += rec_length
return data
def generate_tbs_for_flat_index(indexing_data):
def encode_strands_as_sequences(strands, tbs_type=8):
''' Encode the list of strands for a single text record into a list of
sequences, ready to be converted into TBS bytes. '''
ans = []
record_type = 8 # 8 for KF8 0 for MOBI 6
enc = partial(encode_tbs, flag_size=3)
for datum in indexing_data:
tbs = b''
extra = {0b010 : record_type}
if not (datum.starts or datum.ends or datum.completes or datum.spans):
# No index entry touches this record
pass
elif datum.spans:
extra[0b001] = 0
tbs = enc(datum.spans[0].index, extra)
else:
starts, ends, completes = datum[:3]
if (not completes and len(starts) + len(ends) == 1):
# Either has the first or the last index, and no other indices.
node = (starts+ends)[0]
tbs = enc(node.index, extra)
else:
# This record contains the end of an index and
# some complete index entries. Or it contains some complete
# entries and a start. Or it contains an end, a start and
# optionally some completes. In every case, we encode the first
# entry to touch this record and the number of entries
# that touch this record.
nodes = starts + completes + ends
nodes.sort(key=lambda x:x.index)
extra[0b100] = len(nodes)
tbs = enc(nodes[0].index, extra)
ans.append(tbs)
last_index = None
max_length_offset = 0
first_entry = None
for strand in strands:
for entries in strand.itervalues():
for entry in entries:
if first_entry is None:
first_entry = entry
if entry.length_offset > max_length_offset:
max_length_offset = entry.length_offset
for strand in strands:
strand_seqs = []
for depth, entries in strand.iteritems():
extra = {}
if entries[-1].action == 'spans':
extra[0b1] = 0
elif False and (
entries[-1].length_offset < entries[-1].text_record_length and
entries[-1].action == 'completes' and
entries[-1].length_offset != max_length_offset):
# I can't figure out exactly when kindlegen decides to insert
# this
extra[0b1] = entries[-1].length_offset
if entries[0] is first_entry:
extra[0b10] = tbs_type
if len(entries) > 1:
extra[0b100] = len(entries)
index = entries[0].index - (entries[0].parent or 0)
if ans and not strand_seqs:
extra[0b1000] = True
index = last_index - entries[0].index
last_index = entries[-1].index
strand_seqs.append((index, extra))
# Handle the case of consecutive action == 'spans' entries
for i, seq in enumerate(strand_seqs):
if i + 1 < len(strand_seqs):
if 0b1 in seq[1] and 0b1 in strand_seqs[i+1][1]:
del seq[1][0b1]
ans.extend(strand_seqs)
return ans
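Continuing the hypothetical two-entry sketch from above, the single strand for the first record encodes as one sequence:

encode_strands_as_sequences(data[0])  # -> [(0, {0b10: 8, 0b100: 2})]

Here the 0b10 flag carries the TBS type (8 for KF8) on the first entry of the record, and the 0b100 flag carries the number of entries in the strand; a 0b1 flag would mark a spanning entry and 0b1000 the start of a new strand after the first.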
def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
def sequences_to_bytes(sequences):
ans = []
flag_size = 3
for val, extra in sequences:
ans.append(encode_tbs(val, extra, flag_size))
flag_size = 4
return b''.join(ans)
def apply_trailing_byte_sequences(index_table, records, text_record_lengths):
entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
r.get('parent', None), r.get('first_child', None), r.get('last_child',
None), r['label']) for r in index_table)
None), r['label'], None, None, None, None) for r in index_table)
indexing_data = collect_indexing_data(entries, number_of_text_records)
max_depth = max(e['depth'] for e in index_table)
if max_depth > 0:
# TODO: Implement for hierarchical ToCs
tbs = []
else:
tbs = generate_tbs_for_flat_index(indexing_data)
if not tbs:
return False
for i, tbs_bytes in enumerate(tbs):
indexing_data = collect_indexing_data(entries, text_record_lengths)
for i, strands in enumerate(indexing_data):
sequences = encode_strands_as_sequences(strands)
tbs_bytes = sequences_to_bytes(sequences)
records[i+1] += encode_trailing_data(tbs_bytes)
return True
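A sketch of the entry point with a hypothetical flat index table (text record i gets its trailing bytes appended to records[i+1]):

index_table = [
    {'index': 0, 'offset': 0, 'length': 3000, 'depth': 0, 'label': 'one'},
    {'index': 1, 'offset': 3000, 'length': 5192, 'depth': 0, 'label': 'two'},
]
apply_trailing_byte_sequences(index_table, records, [4096, 4096])

In KF8Writer this is called with the NCX entries, self.records and the newly collected self.uncompressed_record_lengths, so that record boundaries are computed from actual pre-compression record sizes rather than a fixed RECORD_SIZE.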