diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 77a31606e2..34eeb78e9e 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -549,6 +549,9 @@ class TextRecord(object): # {{{ raw = '%s : %r\n\n'%(k, v) f.write(raw.encode('utf-8')) + def __len__(self): + return len(self.raw) + # }}} diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py index f005c8b24f..488adef05d 100644 --- a/src/calibre/ebooks/mobi/debug/index.py +++ b/src/calibre/ebooks/mobi/debug/index.py @@ -158,9 +158,12 @@ class NCXIndex(Index): self.records = [] if self.table is not None: + NCXEntry = namedtuple('NCXEntry', 'index start length depth parent ' + 'first_child last_child title pos_fid') + for num, x in enumerate(self.table.iteritems()): text, tag_map = x - entry = default_entry.copy() + entry = e = default_entry.copy() entry['name'] = text entry['num'] = num @@ -179,7 +182,17 @@ class NCXIndex(Index): if tag == which: entry[name] = self.cncx.get(fieldvalue, default_entry[name]) + def refindx(e, name): + ans = e[name] + if ans < 0: + ans = None + return ans + + entry = NCXEntry(start=e['pos'], index=e['num'], + length=e['len'], depth=e['hlvl'], parent=refindx(e, + 'parent'), first_child=refindx(e, 'child1'), + last_child=refindx(e, 'childn'), title=e['text'], + pos_fid=e['pos_fid']) self.records.append(entry) - diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index 40470ad2dd..a03205edd7 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -2,19 +2,20 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from __future__ import (unicode_literals, division, absolute_import, print_function) +from future_builtins import map __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, imghdr, struct +import sys, os, 
imghdr, struct, textwrap from itertools import izip from calibre import CurrentDir from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex, GuideIndex) -from calibre.ebooks.mobi.utils import read_font_record, decode_tbs +from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.reader.headers import NULL_INDEX @@ -185,38 +186,42 @@ class MOBIFile(object): payload)) def read_tbs(self): - from calibre.ebooks.mobi.writer8.tbs import (Entry, - collect_indexing_data) + from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC, + collect_indexing_data, encode_strands_as_sequences, + sequences_to_bytes) entry_map = [] for index in self.ncx_index: - enders = [e['pos'] for e in self.ncx_index if e['pos'] > - index['pos'] and - e['hlvl'] <= index['hlvl']] - end = min(enders+[len(self.raw_text)]) + vals = list(index)[:-1] + [None, None, None, None] + entry_map.append(Entry(*vals)) - entry_map.append(Entry(index=index['num'], title=index['text'], - depth=index['hlvl'], - parent=index['parent'] if index['parent'] > -1 else None, - first_child=index['child1'] if index['child1'] > -1 else None, - last_child=index['childn'] if index['childn'] > -1 else None, - start=index['pos'], length=end-index['pos'])) - indexing_data = collect_indexing_data(entry_map, - len(self.text_records)) - self.indexing_data = [] - for i, data in enumerate(indexing_data): + indexing_data = collect_indexing_data(entry_map, list(map(len, + self.text_records))) + self.indexing_data = [DOC + '\n' +textwrap.dedent('''\ + Index Entry lines are of the form: + depth:index_number [action] parent (index_num-parent) Geometry + + Where Geometry is the start and end of the index entry w.r.t + the start of the text record. 
+ + ''')] + for i, strands in enumerate(indexing_data): rec = self.text_records[i] tbs_bytes = rec.trailing_data.get('indexing', b'') desc = ['Record #%d'%i] - for x in ('starts', 'completes', 'ends', 'spans'): - points = ['\t%d at depth: %d'%(e.index, e.depth) for e in - getattr(data, x)] - if points: - desc.append(x+':') - desc.extend(points) + for s, strand in enumerate(strands): + desc.append('Strand %d'%s) + for entries in strand.itervalues(): + for e in entries: + desc.append( + ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%( + e.depth * (' ') + '- ', e.index, e.action, e.parent, + e.index-(e.parent or 0), e.start-i*RECORD_SIZE, + e.start+e.length-i*RECORD_SIZE)) desc.append('TBS Bytes: ' + format_bytes(tbs_bytes)) flag_sz = 3 sequences = [] + otbs = tbs_bytes while tbs_bytes: try: val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz) @@ -226,10 +231,16 @@ class MOBIFile(object): tbs_bytes = tbs_bytes[consumed:] extra = {bin(k):v for k, v in extra.iteritems()} sequences.append((val, extra)) - for i, seq in enumerate(sequences): - desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1])) + for j, seq in enumerate(sequences): + desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1])) if tbs_bytes: desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes)) + calculated_sequences = encode_strands_as_sequences(strands) + calculated_bytes = sequences_to_bytes(calculated_sequences) + if calculated_bytes != otbs: + print ('WARNING: TBS mismatch for record %d'%i) + desc.append('WARNING: TBS mismatch!') + desc.append('Calculated sequences: %r'%calculated_sequences) desc.append('') self.indexing_data.append('\n'.join(desc)) @@ -242,8 +253,7 @@ def inspect_mobi(mobi_file, ddir): with open(alltext, 'wb') as of: of.write(f.raw_text) - for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows', - 'tbs'): + for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'): os.mkdir(os.path.join(ddir, x)) for rec in f.text_records: @@ -269,7 
+279,7 @@ def inspect_mobi(mobi_file, ddir): with open(os.path.join(ddir, 'guide.record'), 'wb') as fo: fo.write(str(f.guide_index).encode('utf-8')) - with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo: + with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo: fo.write(('\n'.join(f.indexing_data)).encode('utf-8')) for part in f.files: diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 97ed31a2e3..4e6719bb90 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -264,12 +264,14 @@ class KF8Writer(object): text = BytesIO(text) nrecords = 0 records_size = 0 + self.uncompressed_record_lengths = [] if self.compress: self.oeb.logger.info('\tCompressing markup...') while text.tell() < self.text_length: data, overlap = create_text_record(text) + self.uncompressed_record_lengths.append(len(data)) if self.compress: data = compress_doc(data) @@ -372,7 +374,7 @@ class KF8Writer(object): entry['length'] = get_next_start(entry) - entry['offset'] self.has_tbs = apply_trailing_byte_sequences(entries, self.records, - self.last_text_record_idx+1) + self.uncompressed_record_lengths) self.ncx_records = NCXIndex(entries)() def create_guide(self): diff --git a/src/calibre/ebooks/mobi/writer8/tbs.py b/src/calibre/ebooks/mobi/writer8/tbs.py index 36ecdcdf5c..6040c79709 100644 --- a/src/calibre/ebooks/mobi/writer8/tbs.py +++ b/src/calibre/ebooks/mobi/writer8/tbs.py @@ -7,103 +7,175 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from collections import namedtuple -from functools import partial +DOC = ''' +Trailing Byte Sequences contain information about which index entries touch a +particular text record. Every text record has a set of trailing byte +sequences. In order to figure out the sequence for a given text record, you +have to first calculate all the indices that start, end, span and are +contained within that text record. 
Then arrange the indices into 'strands', +where each strand is a hierarchical progression from the top level index down. +For the exact algorithm, see separate_strands(). The strands are then encoded +into 'sequences', see encode_strands_as_sequences() and finally the sequences +are turned into bytes. +''' +from collections import namedtuple, OrderedDict +from operator import attrgetter -from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data, +from calibre.ebooks.mobi.utils import (encode_trailing_data, encode_tbs) Entry = namedtuple('IndexEntry', 'index start length depth parent ' - 'first_child last_child title') -Data = namedtuple('Data', 'starts ends completes spans') + 'first_child last_child title action start_offset length_offset ' + 'text_record_length') -def collect_indexing_data(entries, number_of_text_records): +def fill_entry(entry, start_offset, text_record_length): + length_offset = start_offset + entry.length + if start_offset < 0: + action = 'spans' if length_offset > text_record_length else 'ends' + else: + action = 'starts' if length_offset > text_record_length else 'completes' + + return Entry(*(entry[:-4] + (action, start_offset, length_offset, + text_record_length))) + +def populate_strand(parent, entries): + ans = [parent] + children = [c for c in entries if c.parent == parent.index] + if children: + # Add first child to this strand, and recurse downwards + child = children[0] + entries.remove(child) + ans += populate_strand(child, entries) + else: + # Add any entries at the same depth that form a contiguous set of + # indices and belong to the same parent (these can all be + # represented as a single sequence with the 0b100 flag) + current_index = parent.index + siblings = [] + for entry in list(entries): + if (entry.depth == parent.depth and entry.parent == parent.parent + and entry.index == current_index+1): + current_index += 1 + entries.remove(entry) + children = [c for c in entries if c.parent == entry.index] + if 
children: + siblings += populate_strand(entry, entries) + break # Cannot add more siblings, as we have added children + else: + siblings.append(entry) + ans += siblings + return ans + +def separate_strands(entries): + ans = [] + while entries: + top, entries = entries[0], entries[1:] + strand = populate_strand(top, entries) + layers = OrderedDict() + for entry in strand: + if entry.depth not in layers: + layers[entry.depth] = [] + layers[entry.depth].append(entry) + ans.append(layers) + return ans + +def collect_indexing_data(entries, text_record_lengths): ''' For every text record calculate which index entries start, end, span or - are contained within that record.''' + are contained within that record. Arrange these entries in 'strands'. ''' data = [] - for i in xrange(number_of_text_records): - record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE - datum = Data([], [], [], []) - data.append(datum) + entries = sorted(entries, key=attrgetter('start')) + record_start = 0 + for rec_length in text_record_lengths: + next_record_start = record_start + rec_length + local_entries = [] for entry in entries: - end = entry.start + entry.length - 1 - if (entry.start >= next_record_start or end < record_start): - # This entry does not have any overlap with this record + if entry.start >= next_record_start: + # No more entries overlap this record + break + if entry.start + entry.length <= record_start: + # This entry does not touch this record continue - if (entry.start < record_start and end >= next_record_start): - # This entry spans this record - datum.spans.append(entry) - continue - if (entry.start >= record_start and end < next_record_start): - # This entry is contained in this record - datum.completes.append(entry) - if (entry.start >= record_start and end >= next_record_start): - # This entry starts in this record - datum.starts.append(entry) - continue - if (entry.start < record_start and end < next_record_start): - # This entry ends in this record - 
datum.ends.append(entry) + local_entries.append(fill_entry(entry, entry.start - record_start, + rec_length)) - for x in datum: - # Should be unnecessary as entries are already in this order, but - # best to be safe. - x.sort(key=lambda x:x.depth) + strands = separate_strands(local_entries) + data.append(strands) + record_start += rec_length return data -def generate_tbs_for_flat_index(indexing_data): +def encode_strands_as_sequences(strands, tbs_type=8): + ''' Encode the list of strands for a single text record into a list of + sequences, ready to be converted into TBS bytes. ''' ans = [] - record_type = 8 # 8 for KF8 0 for MOBI 6 - enc = partial(encode_tbs, flag_size=3) - for datum in indexing_data: - tbs = b'' - extra = {0b010 : record_type} - if not (datum.starts or datum.ends or datum.completes or datum.spans): - # No index entry touches this record - pass - elif datum.spans: - extra[0b001] = 0 - tbs = enc(datum.spans[0].index, extra) - else: - starts, ends, completes = datum[:3] - if (not completes and len(starts) + len(ends) == 1): - # Either has the first or the last index, and no other indices. - node = (starts+ends)[0] - tbs = enc(node.index, extra) - else: - # This record contains the end of an index and - # some complete index entries. Or it contains some complete - # entries and a start. Or it contains an end, a start and - # optionally some completes. In every case, we encode the first - # entry to touch this record and the number of entries - # that touch this record. 
- nodes = starts + completes + ends - nodes.sort(key=lambda x:x.index) - extra[0b100] = len(nodes) - tbs = enc(nodes[0].index, extra) - ans.append(tbs) + last_index = None + max_length_offset = 0 + first_entry = None + for strand in strands: + for entries in strand.itervalues(): + for entry in entries: + if first_entry is None: + first_entry = entry + if entry.length_offset > max_length_offset: + max_length_offset = entry.length_offset + + for strand in strands: + strand_seqs = [] + for depth, entries in strand.iteritems(): + extra = {} + if entries[-1].action == 'spans': + extra[0b1] = 0 + elif False and ( + entries[-1].length_offset < entries[-1].text_record_length and + entries[-1].action == 'completes' and + entries[-1].length_offset != max_length_offset): + # I can't figure out exactly when kindlegen decides to insert + # this + extra[0b1] = entries[-1].length_offset + + if entries[0] is first_entry: + extra[0b10] = tbs_type + + if len(entries) > 1: + extra[0b100] = len(entries) + + index = entries[0].index - (entries[0].parent or 0) + if ans and not strand_seqs: + extra[0b1000] = True + index = last_index - entries[0].index + last_index = entries[-1].index + strand_seqs.append((index, extra)) + + # Handle the case of consecutive action == 'spans' entries + for i, seq in enumerate(strand_seqs): + if i + 1 < len(strand_seqs): + if 0b1 in seq[1] and 0b1 in strand_seqs[i+1][1]: + del seq[1][0b1] + ans.extend(strand_seqs) return ans -def apply_trailing_byte_sequences(index_table, records, number_of_text_records): +def sequences_to_bytes(sequences): + ans = [] + flag_size = 3 + for val, extra in sequences: + ans.append(encode_tbs(val, extra, flag_size)) + flag_size = 4 + return b''.join(ans) + +def apply_trailing_byte_sequences(index_table, records, text_record_lengths): entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'], r.get('parent', None), r.get('first_child', None), r.get('last_child', - None), r['label']) for r in index_table) + None), 
r['label'], None, None, None, None) for r in index_table) - indexing_data = collect_indexing_data(entries, number_of_text_records) - max_depth = max(e['depth'] for e in index_table) - if max_depth > 0: - # TODO: Implement for hierarchical ToCs - tbs = [] - else: - tbs = generate_tbs_for_flat_index(indexing_data) - if not tbs: - return False - for i, tbs_bytes in enumerate(tbs): + indexing_data = collect_indexing_data(entries, text_record_lengths) + for i, strands in enumerate(indexing_data): + sequences = encode_strands_as_sequences(strands) + tbs_bytes = sequences_to_bytes(sequences) records[i+1] += encode_trailing_data(tbs_bytes) + return True