KF8 Output: Proper algorithm for calculating trailing bytes for all types of ToCs

Kovid Goyal 2012-04-28 22:17:42 +05:30
parent f4e6d943ee
commit 89e01414c5
5 changed files with 206 additions and 106 deletions

View File

@@ -549,6 +549,9 @@ class TextRecord(object): # {{{
             raw = '%s : %r\n\n'%(k, v)
             f.write(raw.encode('utf-8'))
 
+    def __len__(self):
+        return len(self.raw)
+
 # }}}
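
A note on why this helper matters: later in this commit the debug code computes per-record lengths with a plain map(len, self.text_records), which only works once TextRecord supports len(). A minimal sketch of the idea, with a stub class standing in for calibre's TextRecord:

# Stub standing in for calibre.ebooks.mobi.debug.headers.TextRecord
class TextRecord(object):

    def __init__(self, raw):
        self.raw = raw  # the uncompressed record payload

    def __len__(self):
        return len(self.raw)

records = [TextRecord(b'a'*4096), TextRecord(b'b'*1024)]
print(list(map(len, records)))  # [4096, 1024]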

View File

@@ -158,9 +158,12 @@ class NCXIndex(Index):
         self.records = []
 
         if self.table is not None:
+            NCXEntry = namedtuple('NCXEntry', 'index start length depth parent '
+                    'first_child last_child title pos_fid')
             for num, x in enumerate(self.table.iteritems()):
                 text, tag_map = x
-                entry = default_entry.copy()
+                entry = e = default_entry.copy()
                 entry['name'] = text
                 entry['num'] = num
@@ -179,7 +182,17 @@ class NCXIndex(Index):
                     if tag == which:
                         entry[name] = self.cncx.get(fieldvalue,
                             default_entry[name])
+                def refindx(e, name):
+                    ans = e[name]
+                    if ans < 0:
+                        ans = None
+                    return ans
+
+                entry = NCXEntry(start=e['pos'], index=e['num'],
+                        length=e['len'], depth=e['hlvl'], parent=refindx(e,
+                            'parent'), first_child=refindx(e, 'child1'),
+                        last_child=refindx(e, 'childn'), title=e['text'],
+                        pos_fid=e['pos_fid'])
                 self.records.append(entry)
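
In calibre's index tables a parent or child field of -1 is a sentinel for "absent"; the new refindx helper normalizes it to None before the values are frozen into the NCXEntry namedtuple. A self-contained toy version of the pattern (the sample dict values are invented):

from collections import namedtuple

NCXEntry = namedtuple('NCXEntry', 'index parent first_child last_child')

def refindx(e, name):
    # A negative index value is a sentinel meaning "no such relative"
    ans = e[name]
    if ans < 0:
        ans = None
    return ans

# Invented sample entry: index 3 has no parent and children 4..7
e = {'num': 3, 'parent': -1, 'child1': 4, 'childn': 7}
entry = NCXEntry(index=e['num'], parent=refindx(e, 'parent'),
        first_child=refindx(e, 'child1'), last_child=refindx(e, 'childn'))
print(entry)  # NCXEntry(index=3, parent=None, first_child=4, last_child=7)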

View File

@@ -2,19 +2,20 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 
 from __future__ import (unicode_literals, division, absolute_import,
                         print_function)
+from future_builtins import map
 
 __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import sys, os, imghdr, struct
+import sys, os, imghdr, struct, textwrap
 from itertools import izip
 
 from calibre import CurrentDir
 from calibre.ebooks.mobi.debug.headers import TextRecord
 from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
         GuideIndex)
-from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
+from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
 from calibre.ebooks.mobi.debug import format_bytes
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@@ -185,38 +186,42 @@ class MOBIFile(object):
             payload))
 
     def read_tbs(self):
-        from calibre.ebooks.mobi.writer8.tbs import (Entry,
-                collect_indexing_data)
+        from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
+                collect_indexing_data, encode_strands_as_sequences,
+                sequences_to_bytes)
         entry_map = []
         for index in self.ncx_index:
-            enders = [e['pos'] for e in self.ncx_index if e['pos'] >
-                    index['pos'] and
-                    e['hlvl'] <= index['hlvl']]
-            end = min(enders+[len(self.raw_text)])
-
-            entry_map.append(Entry(index=index['num'], title=index['text'],
-                depth=index['hlvl'],
-                parent=index['parent'] if index['parent'] > -1 else None,
-                first_child=index['child1'] if index['child1'] > -1 else None,
-                last_child=index['childn'] if index['childn'] > -1 else None,
-                start=index['pos'], length=end-index['pos']))
+            vals = list(index)[:-1] + [None, None, None, None]
+            entry_map.append(Entry(*vals))
 
-        indexing_data = collect_indexing_data(entry_map,
-                len(self.text_records))
-        self.indexing_data = []
-        for i, data in enumerate(indexing_data):
+        indexing_data = collect_indexing_data(entry_map, list(map(len,
+            self.text_records)))
+        self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
+                Index Entry lines are of the form:
+                depth:index_number [action] parent (index_num-parent) Geometry
+
+                Where Geometry is the start and end of the index entry w.r.t
+                the start of the text record.
+
+                ''')]
+        for i, strands in enumerate(indexing_data):
             rec = self.text_records[i]
             tbs_bytes = rec.trailing_data.get('indexing', b'')
             desc = ['Record #%d'%i]
-            for x in ('starts', 'completes', 'ends', 'spans'):
-                points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
-                        getattr(data, x)]
-                if points:
-                    desc.append(x+':')
-                    desc.extend(points)
+            for s, strand in enumerate(strands):
+                desc.append('Strand %d'%s)
+                for entries in strand.itervalues():
+                    for e in entries:
+                        desc.append(
+                            ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
+                                e.depth * (' ') + '- ', e.index, e.action,
+                                e.parent, e.index-(e.parent or 0),
+                                e.start-i*RECORD_SIZE,
+                                e.start+e.length-i*RECORD_SIZE))
             desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
             flag_sz = 3
             sequences = []
+            otbs = tbs_bytes
             while tbs_bytes:
                 try:
                     val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
@@ -226,10 +231,16 @@ class MOBIFile(object):
                 tbs_bytes = tbs_bytes[consumed:]
                 extra = {bin(k):v for k, v in extra.iteritems()}
                 sequences.append((val, extra))
-            for i, seq in enumerate(sequences):
-                desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
+            for j, seq in enumerate(sequences):
+                desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
             if tbs_bytes:
                 desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
+            calculated_sequences = encode_strands_as_sequences(strands)
+            calculated_bytes = sequences_to_bytes(calculated_sequences)
+            if calculated_bytes != otbs:
+                print('WARNING: TBS mismatch for record %d'%i)
+                desc.append('WARNING: TBS mismatch!')
+                desc.append('Calculated sequences: %r'%calculated_sequences)
             desc.append('')
             self.indexing_data.append('\n'.join(desc))
@@ -242,8 +253,7 @@ def inspect_mobi(mobi_file, ddir):
     with open(alltext, 'wb') as of:
         of.write(f.raw_text)
 
-    for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
-            'tbs'):
+    for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'):
         os.mkdir(os.path.join(ddir, x))
 
     for rec in f.text_records:
@@ -269,7 +279,7 @@ def inspect_mobi(mobi_file, ddir):
     with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
        fo.write(str(f.guide_index).encode('utf-8'))
 
-    with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
+    with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
        fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
 
     for part in f.files:
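
The substantive change in read_tbs() above is that the inspector now recomputes the trailing byte sequences from the parsed NCX and compares them with the bytes actually stored in the record, warning on any mismatch. The shape of that round-trip check, sketched as a minimal standalone function with stand-in callables for this commit's encode_strands_as_sequences and sequences_to_bytes:

def check_tbs_roundtrip(record_num, stored_tbs, strands,
        encode_strands, to_bytes):
    # encode_strands and to_bytes are stand-ins for
    # encode_strands_as_sequences and sequences_to_bytes
    calculated = to_bytes(encode_strands(strands))
    if calculated != stored_tbs:
        print('WARNING: TBS mismatch for record %d' % record_num)
    return calculated == stored_tbs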

View File

@@ -264,12 +264,14 @@ class KF8Writer(object):
         text = BytesIO(text)
         nrecords = 0
         records_size = 0
+        self.uncompressed_record_lengths = []
 
         if self.compress:
             self.oeb.logger.info('\tCompressing markup...')
 
         while text.tell() < self.text_length:
             data, overlap = create_text_record(text)
+            self.uncompressed_record_lengths.append(len(data))
             if self.compress:
                 data = compress_doc(data)
@@ -372,7 +374,7 @@ class KF8Writer(object):
             entry['length'] = get_next_start(entry) - entry['offset']
 
         self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
-                self.last_text_record_idx+1)
+                self.uncompressed_record_lengths)
 
         self.ncx_records = NCXIndex(entries)()
 
     def create_guide(self):
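
The index geometry used for TBS refers to offsets in the uncompressed text, so the writer has to capture each record's length before PalmDoc compression shrinks it; that is what uncompressed_record_lengths is for. A minimal sketch of the bookkeeping, with stub functions standing in for calibre's create_text_record and compress_doc:

from io import BytesIO

RECORD_SIZE = 4096  # size of one uncompressed KF8 text record

def create_text_record(text):  # stub: calibre's also returns overlap bytes
    return text.read(RECORD_SIZE)

def compress_doc(data):  # stub for PalmDoc compression
    return data[:len(data)//2]

text, text_length = BytesIO(b'x'*10000), 10000
uncompressed_record_lengths, records = [], []
while text.tell() < text_length:
    data = create_text_record(text)
    uncompressed_record_lengths.append(len(data))  # before compression
    records.append(compress_doc(data))
print(uncompressed_record_lengths)  # [4096, 4096, 1808]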

View File

@@ -7,103 +7,175 @@ __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from collections import namedtuple
-from functools import partial
+DOC = '''
+Trailing Byte Sequences contain information about which index entries touch a
+particular text record. Every text record has a set of trailing byte
+sequences. In order to figure out the sequence for a given text record, you
+have to first calculate all the indices that start, end, span and are
+contained within that text record. Then arrange the indices into 'strands',
+where each strand is a hierarchical progression from the top level index down.
+For the exact algorithm, see separate_strands(). The strands are then encoded
+into 'sequences', see encode_strands_as_sequences() and finally the sequences
+are turned into bytes.
+'''
+
+from collections import namedtuple, OrderedDict
+from operator import attrgetter
 
-from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
-        encode_tbs)
+from calibre.ebooks.mobi.utils import (encode_trailing_data, encode_tbs)
 
 Entry = namedtuple('IndexEntry', 'index start length depth parent '
-        'first_child last_child title')
-Data = namedtuple('Data', 'starts ends completes spans')
+        'first_child last_child title action start_offset length_offset '
+        'text_record_length')
 
-def collect_indexing_data(entries, number_of_text_records):
+def fill_entry(entry, start_offset, text_record_length):
+    length_offset = start_offset + entry.length
+    if start_offset < 0:
+        action = 'spans' if length_offset > text_record_length else 'ends'
+    else:
+        action = 'starts' if length_offset > text_record_length else 'completes'
+    return Entry(*(entry[:-4] + (action, start_offset, length_offset,
+        text_record_length)))
+
+def populate_strand(parent, entries):
+    ans = [parent]
+    children = [c for c in entries if c.parent == parent.index]
+    if children:
+        # Add first child to this strand, and recurse downwards
+        child = children[0]
+        entries.remove(child)
+        ans += populate_strand(child, entries)
+    else:
+        # Add any entries at the same depth that form a contiguous set of
+        # indices and belong to the same parent (these can all be
+        # represented as a single sequence with the 0b100 flag)
+        current_index = parent.index
+        siblings = []
+        for entry in list(entries):
+            if (entry.depth == parent.depth and entry.parent == parent.parent
+                    and entry.index == current_index+1):
+                current_index += 1
+                entries.remove(entry)
+                children = [c for c in entries if c.parent == entry.index]
+                if children:
+                    siblings += populate_strand(entry, entries)
+                    break  # Cannot add more siblings, as we have added children
+                else:
+                    siblings.append(entry)
+        ans += siblings
+    return ans
+
+def separate_strands(entries):
+    ans = []
+    while entries:
+        top, entries = entries[0], entries[1:]
+        strand = populate_strand(top, entries)
+        layers = OrderedDict()
+        for entry in strand:
+            if entry.depth not in layers:
+                layers[entry.depth] = []
+            layers[entry.depth].append(entry)
+        ans.append(layers)
+    return ans
+
+def collect_indexing_data(entries, text_record_lengths):
     ''' For every text record calculate which index entries start, end, span or
-    are contained within that record.'''
+    are contained within that record. Arrange these entries in 'strands'. '''
     data = []
-    for i in xrange(number_of_text_records):
-        record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE
-        datum = Data([], [], [], [])
-        data.append(datum)
+    entries = sorted(entries, key=attrgetter('start'))
+    record_start = 0
+    for rec_length in text_record_lengths:
+        next_record_start = record_start + rec_length
+        local_entries = []
 
         for entry in entries:
-            end = entry.start + entry.length - 1
-            if (entry.start >= next_record_start or end < record_start):
-                # This entry does not have any overlap with this record
+            if entry.start >= next_record_start:
+                # No more entries overlap this record
+                break
+            if entry.start + entry.length <= record_start:
+                # This entry does not touch this record
                 continue
-            if (entry.start < record_start and end >= next_record_start):
-                # This entry spans this record
-                datum.spans.append(entry)
-                continue
-            if (entry.start >= record_start and end < next_record_start):
-                # This entry is contained in this record
-                datum.completes.append(entry)
-            if (entry.start >= record_start and end >= next_record_start):
-                # This entry starts in this record
-                datum.starts.append(entry)
-                continue
-            if (entry.start < record_start and end < next_record_start):
-                # This entry ends in this record
-                datum.ends.append(entry)
+            local_entries.append(fill_entry(entry, entry.start - record_start,
+                rec_length))
 
-        for x in datum:
-            # Should be unnecessary as entries are already in this order, but
-            # best to be safe.
-            x.sort(key=lambda x:x.depth)
+        strands = separate_strands(local_entries)
+        data.append(strands)
+        record_start += rec_length
 
     return data
 
-def generate_tbs_for_flat_index(indexing_data):
+def encode_strands_as_sequences(strands, tbs_type=8):
+    ''' Encode the list of strands for a single text record into a list of
+    sequences, ready to be converted into TBS bytes. '''
     ans = []
-    record_type = 8 # 8 for KF8 0 for MOBI 6
-    enc = partial(encode_tbs, flag_size=3)
-    for datum in indexing_data:
-        tbs = b''
-        extra = {0b010 : record_type}
-        if not (datum.starts or datum.ends or datum.completes or datum.spans):
-            # No index entry touches this record
-            pass
-        elif datum.spans:
-            extra[0b001] = 0
-            tbs = enc(datum.spans[0].index, extra)
-        else:
-            starts, ends, completes = datum[:3]
-            if (not completes and len(starts) + len(ends) == 1):
-                # Either has the first or the last index, and no other indices.
-                node = (starts+ends)[0]
-                tbs = enc(node.index, extra)
-            else:
-                # This record contains the end of an index and
-                # some complete index entries. Or it contains some complete
-                # entries and a start. Or it contains an end, a start and
-                # optionally some completes. In every case, we encode the first
-                # entry to touch this record and the number of entries
-                # that touch this record.
-                nodes = starts + completes + ends
-                nodes.sort(key=lambda x:x.index)
-                extra[0b100] = len(nodes)
-                tbs = enc(nodes[0].index, extra)
-        ans.append(tbs)
+    last_index = None
+    max_length_offset = 0
+    first_entry = None
+    for strand in strands:
+        for entries in strand.itervalues():
+            for entry in entries:
+                if first_entry is None:
+                    first_entry = entry
+                if entry.length_offset > max_length_offset:
+                    max_length_offset = entry.length_offset
+
+    for strand in strands:
+        strand_seqs = []
+        for depth, entries in strand.iteritems():
+            extra = {}
+            if entries[-1].action == 'spans':
+                extra[0b1] = 0
+            elif False and (
+                    entries[-1].length_offset < entries[-1].text_record_length and
+                    entries[-1].action == 'completes' and
+                    entries[-1].length_offset != max_length_offset):
+                # I can't figure out exactly when kindlegen decides to insert
+                # this
+                extra[0b1] = entries[-1].length_offset
+
+            if entries[0] is first_entry:
+                extra[0b10] = tbs_type
+
+            if len(entries) > 1:
+                extra[0b100] = len(entries)
+
+            index = entries[0].index - (entries[0].parent or 0)
+            if ans and not strand_seqs:
+                extra[0b1000] = True
+                index = last_index - entries[0].index
+            last_index = entries[-1].index
+            strand_seqs.append((index, extra))
+
+        # Handle the case of consecutive action == 'spans' entries
+        for i, seq in enumerate(strand_seqs):
+            if i + 1 < len(strand_seqs):
+                if 0b1 in seq[1] and 0b1 in strand_seqs[i+1][1]:
+                    del seq[1][0b1]
+        ans.extend(strand_seqs)
 
     return ans
 
-def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
+def sequences_to_bytes(sequences):
+    ans = []
+    flag_size = 3
+    for val, extra in sequences:
+        ans.append(encode_tbs(val, extra, flag_size))
+        flag_size = 4
+    return b''.join(ans)
+
+def apply_trailing_byte_sequences(index_table, records, text_record_lengths):
     entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
         r.get('parent', None), r.get('first_child', None), r.get('last_child',
-        None), r['label']) for r in index_table)
+        None), r['label'], None, None, None, None) for r in index_table)
 
-    indexing_data = collect_indexing_data(entries, number_of_text_records)
-    max_depth = max(e['depth'] for e in index_table)
-    if max_depth > 0:
-        # TODO: Implement for hierarchical ToCs
-        tbs = []
-    else:
-        tbs = generate_tbs_for_flat_index(indexing_data)
-    if not tbs:
-        return False
-    for i, tbs_bytes in enumerate(tbs):
+    indexing_data = collect_indexing_data(entries, text_record_lengths)
+    for i, strands in enumerate(indexing_data):
+        sequences = encode_strands_as_sequences(strands)
+        tbs_bytes = sequences_to_bytes(sequences)
         records[i+1] += encode_trailing_data(tbs_bytes)
     return True
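
To make the DOC description concrete, here is a small worked example that drives just the geometry (fill_entry) and strand-separation (separate_strands) steps of this module on a hypothetical two-level ToC. The module path comes from the import in the debug code above; all entry values are invented for illustration:

from calibre.ebooks.mobi.writer8.tbs import Entry, fill_entry, separate_strands

# Invented two-level ToC: a chapter (index 0) containing sections 1 and 2
raw = [
    Entry(0, 0, 9000, 0, None, 1, 2, 'Chapter', None, None, None, None),
    Entry(1, 0, 5000, 1, 0, None, None, 'Sec 1', None, None, None, None),
    Entry(2, 5000, 4000, 1, 0, None, None, 'Sec 2', None, None, None, None),
]

rec_length = 4096  # geometry is relative to the first text record
local = [fill_entry(e, e.start, rec_length) for e in raw
        if e.start < rec_length and e.start + e.length > 0]
for strand in separate_strands(local):
    for depth, entries in strand.items():
        print('%d %s' % (depth, [(e.index, e.action) for e in entries]))
# One strand, a hierarchical progression from the top level down:
# 0 [(0, 'starts')]
# 1 [(1, 'starts')]

Section 2 begins at offset 5000, past the end of the first record, so it does not appear; the chapter and its first section both start in this record and end beyond it, hence the 'starts' action for each.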