def read_tbs(self):
    ''' Describe the indexing (Trailing Byte Sequence) data of every text
    record in human readable form.

    An Entry is built for every NCX index entry, the entries are mapped
    onto the text records with collect_indexing_data() and one multi-line
    description per text record is stored in self.indexing_data. Each
    description lists the entries that start, complete, end or span the
    record, followed by the raw and decoded 'indexing' trailing bytes
    actually present in the record. '''
    from calibre.ebooks.mobi.writer8.tbs import (Entry,
            collect_indexing_data)

    def link(value):
        # Link fields that are not greater than -1 are reported as None
        return value if value > -1 else None

    text_length = len(self.raw_text)
    entry_map = []
    for node in self.ncx_index:
        # An entry extends up to the nearest following entry that is at the
        # same or a shallower level, or to the end of the text if there is
        # no such entry.
        candidates = [other['pos'] for other in self.ncx_index
                if other['pos'] > node['pos'] and
                other['hlvl'] <= node['hlvl']]
        candidates.append(text_length)
        end = min(candidates)

        entry = Entry(
            index=node['num'],
            title=node['text'],
            depth=node['hlvl'],
            parent=link(node['parent']),
            first_child=link(node['child1']),
            last_child=link(node['childn']),
            start=node['pos'],
            length=end - node['pos'])
        entry_map.append(entry)

    indexing_data = collect_indexing_data(entry_map,
            len(self.text_records))
    self.indexing_data = []
    for num, data in enumerate(indexing_data):
        record = self.text_records[num]
        tbs_bytes = record.trailing_data.get('indexing', b'')

        lines = ['Record #%d'%num]
        for category in ('starts', 'completes', 'ends', 'spans'):
            members = getattr(data, category)
            if members:
                lines.append(category+':')
                for e in members:
                    lines.append('\t%d at depth: %d'%(e.index, e.depth))

        lines.append('TBS Bytes: ' + format_bytes(tbs_bytes))
        val, extra, consumed = decode_tbs(tbs_bytes, flag_size=3)
        # Show the flag keys in binary so individual bits are easy to read
        extra = {bin(k):v for k, v in extra.iteritems()}
        lines.append('First sequence: %r %r'%(val, extra))

        leftover = tbs_bytes[consumed:]
        if leftover:
            lines.append('Remaining bytes: %s'%format_bytes(leftover))
        lines.append('')
        self.indexing_data.append('\n'.join(lines))
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1 - header_fields['extra_data_flags'] = 0b11 + extra_data_flags = 0b1 # Has multibyte overlap bytes + if self.primary_index_record_idx is not None: + extra_data_flags |= 0b10 + header_fields['extra_data_flags'] = extra_data_flags for k, v in {'last_text_record':'last_text_record_idx', 'first_non_text_record':'first_non_text_record_idx', diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 5e2c75b267..ef34692fd3 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -27,6 +27,7 @@ from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex, ChunkIndex, GuideIndex) from calibre.ebooks.mobi.writer8.mobi import KF8Book +from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences XML_DOCS = OEB_DOCS | {SVG_MIME} @@ -39,6 +40,7 @@ class KF8Writer(object): def __init__(self, oeb, opts, resources): self.oeb, self.opts, self.log = oeb, opts, oeb.log self.compress = not self.opts.dont_compress + self.has_tbs = False self.log.info('Creating KF8 output') self.used_images = set() self.resources = resources @@ -363,6 +365,8 @@ class KF8Writer(object): for entry in entries: entry['length'] = get_next_start(entry) - entry['offset'] + self.has_tbs = apply_trailing_byte_sequences(entries, self.records, + self.last_text_record_idx+1) self.ncx_records = NCXIndex(entries)() def create_guide(self): diff --git a/src/calibre/ebooks/mobi/writer8/mobi.py b/src/calibre/ebooks/mobi/writer8/mobi.py index ff096f350b..7cdfd5a4b1 100644 --- a/src/calibre/ebooks/mobi/writer8/mobi.py +++ b/src/calibre/ebooks/mobi/writer8/mobi.py @@ -250,6 +250,8 @@ class KF8Book(object): self.full_title = utf8_text(unicode(metadata.title[0])) self.title_length = len(self.full_title) self.extra_data_flags = 0b1 + if writer.has_tbs: + self.extra_data_flags |= 0b10 
self.uid = random.randint(0, 0xffffffff) self.language_code = iana2mobi(str(metadata.language[0])) diff --git a/src/calibre/ebooks/mobi/writer8/tbs.py b/src/calibre/ebooks/mobi/writer8/tbs.py new file mode 100644 index 0000000000..36ecdcdf5c --- /dev/null +++ b/src/calibre/ebooks/mobi/writer8/tbs.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from collections import namedtuple +from functools import partial + +from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data, + encode_tbs) + +Entry = namedtuple('IndexEntry', 'index start length depth parent ' + 'first_child last_child title') +Data = namedtuple('Data', 'starts ends completes spans') + +def collect_indexing_data(entries, number_of_text_records): + ''' For every text record calculate which index entries start, end, span or + are contained within that record.''' + + data = [] + for i in xrange(number_of_text_records): + record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE + datum = Data([], [], [], []) + data.append(datum) + + for entry in entries: + end = entry.start + entry.length - 1 + if (entry.start >= next_record_start or end < record_start): + # This entry does not have any overlap with this record + continue + if (entry.start < record_start and end >= next_record_start): + # This entry spans this record + datum.spans.append(entry) + continue + if (entry.start >= record_start and end < next_record_start): + # This entry is contained in this record + datum.completes.append(entry) + if (entry.start >= record_start and end >= next_record_start): + # This entry starts in this record + datum.starts.append(entry) + continue + if (entry.start < record_start and end < next_record_start): + # This entry ends in this record + 
datum.ends.append(entry) + + for x in datum: + # Should be unnecessary as entries are already in this order, but + # best to be safe. + x.sort(key=lambda x:x.depth) + + return data + +def generate_tbs_for_flat_index(indexing_data): + ans = [] + record_type = 8 # 8 for KF8 0 for MOBI 6 + enc = partial(encode_tbs, flag_size=3) + for datum in indexing_data: + tbs = b'' + extra = {0b010 : record_type} + if not (datum.starts or datum.ends or datum.completes or datum.spans): + # No index entry touches this record + pass + elif datum.spans: + extra[0b001] = 0 + tbs = enc(datum.spans[0].index, extra) + else: + starts, ends, completes = datum[:3] + if (not completes and len(starts) + len(ends) == 1): + # Either has the first or the last index, and no other indices. + node = (starts+ends)[0] + tbs = enc(node.index, extra) + else: + # This record contains the end of an index and + # some complete index entries. Or it contains some complete + # entries and a start. Or it contains an end, a start and + # optionally some completes. In every case, we encode the first + # entry to touch this record and the number of entries + # that touch this record. + nodes = starts + completes + ends + nodes.sort(key=lambda x:x.index) + extra[0b100] = len(nodes) + tbs = enc(nodes[0].index, extra) + ans.append(tbs) + + return ans + +def apply_trailing_byte_sequences(index_table, records, number_of_text_records): + entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'], + r.get('parent', None), r.get('first_child', None), r.get('last_child', + None), r['label']) for r in index_table) + + indexing_data = collect_indexing_data(entries, number_of_text_records) + max_depth = max(e['depth'] for e in index_table) + if max_depth > 0: + # TODO: Implement for hierarchical ToCs + tbs = [] + else: + tbs = generate_tbs_for_flat_index(indexing_data) + if not tbs: + return False + for i, tbs_bytes in enumerate(tbs): + records[i+1] += encode_trailing_data(tbs_bytes) + return True + +