mirror of https://github.com/kovidgoyal/calibre.git
KF8 Output: Proper algorithm for calculating trailing bytes for all types of ToCs
This commit is contained in:
parent f4e6d943ee
commit 89e01414c5
src/calibre/ebooks/mobi/debug/headers.py

@@ -549,6 +549,9 @@ class TextRecord(object): # {{{
                 raw = '%s : %r\n\n'%(k, v)
                 f.write(raw.encode('utf-8'))
 
+    def __len__(self):
+        return len(self.raw)
+
 # }}}
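
TextRecord now reports the length of its raw (uncompressed) payload, which is what lets the debug code below build a per-record length list via map(len, ...). A minimal sketch with a stand-in class (not calibre's actual TextRecord):

    class Record(object):
        def __init__(self, raw):
            self.raw = raw
        def __len__(self):
            return len(self.raw)

    records = [Record(b'a' * 4096), Record(b'b' * 1808)]
    print(list(map(len, records)))  # [4096, 1808]
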
src/calibre/ebooks/mobi/debug/index.py

@@ -158,9 +158,12 @@ class NCXIndex(Index):
         self.records = []
 
         if self.table is not None:
+            NCXEntry = namedtuple('NCXEntry', 'index start length depth parent '
+                    'first_child last_child title pos_fid')
+
             for num, x in enumerate(self.table.iteritems()):
                 text, tag_map = x
-                entry = default_entry.copy()
+                entry = e = default_entry.copy()
                 entry['name'] = text
                 entry['num'] = num
@@ -179,7 +182,17 @@ class NCXIndex(Index):
                        if tag == which:
                            entry[name] = self.cncx.get(fieldvalue,
                                    default_entry[name])
+            def refindx(e, name):
+                ans = e[name]
+                if ans < 0:
+                    ans = None
+                return ans
+
+            entry = NCXEntry(start=e['pos'], index=e['num'],
+                    length=e['len'], depth=e['hlvl'], parent=refindx(e,
+                        'parent'), first_child=refindx(e, 'child1'),
+                    last_child=refindx(e, 'childn'), title=e['text'],
+                    pos_fid=e['pos_fid'])
             self.records.append(entry)
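
In the NCX table a negative value means "no such reference" (no parent, no children); refindx normalizes that to None before the values are frozen into the NCXEntry namedtuple. A small sketch with toy values (not a real NCX record):

    from collections import namedtuple

    NCXEntry = namedtuple('NCXEntry', 'index start length depth parent '
            'first_child last_child title pos_fid')

    def refindx(e, name):
        ans = e[name]
        if ans < 0:
            ans = None
        return ans

    e = {'num': 3, 'pos': 8192, 'len': 4000, 'hlvl': 1, 'parent': 0,
         'child1': -1, 'childn': -1, 'text': 'Chapter 2', 'pos_fid': (0, 0)}
    entry = NCXEntry(start=e['pos'], index=e['num'], length=e['len'],
            depth=e['hlvl'], parent=refindx(e, 'parent'),
            first_child=refindx(e, 'child1'),
            last_child=refindx(e, 'childn'), title=e['text'],
            pos_fid=e['pos_fid'])
    print(entry.first_child)  # None, because -1 means "no children"
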
src/calibre/ebooks/mobi/debug/mobi8.py

@@ -2,19 +2,20 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                         print_function)
+from future_builtins import map
 
 __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import sys, os, imghdr, struct
+import sys, os, imghdr, struct, textwrap
 from itertools import izip
 
 from calibre import CurrentDir
 from calibre.ebooks.mobi.debug.headers import TextRecord
 from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
     GuideIndex)
-from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
+from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
 from calibre.ebooks.mobi.debug import format_bytes
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@@ -185,38 +186,42 @@ class MOBIFile(object):
                     payload))
 
     def read_tbs(self):
-        from calibre.ebooks.mobi.writer8.tbs import (Entry,
-                collect_indexing_data)
+        from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
+                collect_indexing_data, encode_strands_as_sequences,
+                sequences_to_bytes)
         entry_map = []
         for index in self.ncx_index:
-            vals = list(index)[:-1] + [None, None, None, None]
-            entry_map.append(Entry(*vals))
+            enders = [e['pos'] for e in self.ncx_index if e['pos'] >
+                    index['pos'] and
+                        e['hlvl'] <= index['hlvl']]
+            end = min(enders+[len(self.raw_text)])
+
+            entry_map.append(Entry(index=index['num'], title=index['text'],
+                depth=index['hlvl'],
+                parent=index['parent'] if index['parent'] > -1 else None,
+                first_child=index['child1'] if index['child1'] > -1 else None,
+                last_child=index['childn'] if index['childn'] > -1 else None,
+                start=index['pos'], length=end-index['pos']))
 
-        indexing_data = collect_indexing_data(entry_map,
-                len(self.text_records))
-        self.indexing_data = []
-        for i, data in enumerate(indexing_data):
+        indexing_data = collect_indexing_data(entry_map, list(map(len,
+            self.text_records)))
+        self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
+                Index Entry lines are of the form:
+                depth:index_number [action] parent (index_num-parent) Geometry
+
+                Where Geometry is the start and end of the index entry w.r.t
+                the start of the text record.
+
+                ''')]
+        for i, strands in enumerate(indexing_data):
             rec = self.text_records[i]
             tbs_bytes = rec.trailing_data.get('indexing', b'')
             desc = ['Record #%d'%i]
-            for x in ('starts', 'completes', 'ends', 'spans'):
-                points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
-                        getattr(data, x)]
-                if points:
-                    desc.append(x+':')
-                    desc.extend(points)
+            for s, strand in enumerate(strands):
+                desc.append('Strand %d'%s)
+                for entries in strand.itervalues():
+                    for e in entries:
+                        desc.append(
+                            ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
+                                e.depth * (' ') + '- ', e.index, e.action, e.parent,
+                                e.index-(e.parent or 0), e.start-i*RECORD_SIZE,
+                                e.start+e.length-i*RECORD_SIZE))
             desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
             flag_sz = 3
             sequences = []
+            otbs = tbs_bytes
             while tbs_bytes:
                 try:
                     val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
@@ -226,10 +231,16 @@ class MOBIFile(object):
                 tbs_bytes = tbs_bytes[consumed:]
                 extra = {bin(k):v for k, v in extra.iteritems()}
                 sequences.append((val, extra))
-            for i, seq in enumerate(sequences):
-                desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
+            for j, seq in enumerate(sequences):
+                desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
             if tbs_bytes:
                 desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
+            calculated_sequences = encode_strands_as_sequences(strands)
+            calculated_bytes = sequences_to_bytes(calculated_sequences)
+            if calculated_bytes != otbs:
+                print('WARNING: TBS mismatch for record %d'%i)
+                desc.append('WARNING: TBS mismatch!')
+                desc.append('Calculated sequences: %r'%calculated_sequences)
             desc.append('')
             self.indexing_data.append('\n'.join(desc))
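
The point of the otbs copy is a round-trip check: the decode loop consumes tbs_bytes as it goes, so the original bytes are saved and compared against an independent re-encoding of the strands computed for the same record. A condensed sketch of that check (assuming the writer8.tbs functions from this commit are importable):

    from calibre.ebooks.mobi.writer8.tbs import (encode_strands_as_sequences,
            sequences_to_bytes)

    def tbs_matches(strands, tbs_bytes_from_file):
        # Re-encode the strands computed for one text record and compare
        # with the trailing bytes actually present in that record
        calculated = sequences_to_bytes(encode_strands_as_sequences(strands))
        return calculated == tbs_bytes_from_file
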
@@ -242,8 +253,7 @@ def inspect_mobi(mobi_file, ddir):
     with open(alltext, 'wb') as of:
         of.write(f.raw_text)
 
-    for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
-            'tbs'):
+    for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'):
         os.mkdir(os.path.join(ddir, x))
 
     for rec in f.text_records:
@@ -269,7 +279,7 @@ def inspect_mobi(mobi_file, ddir):
     with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
         fo.write(str(f.guide_index).encode('utf-8'))
 
-    with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
+    with open(os.path.join(ddir, 'tbs.txt'), 'wb') as fo:
         fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
 
     for part in f.files:
src/calibre/ebooks/mobi/writer8/main.py

@@ -264,12 +264,14 @@ class KF8Writer(object):
         text = BytesIO(text)
         nrecords = 0
         records_size = 0
+        self.uncompressed_record_lengths = []
 
         if self.compress:
             self.oeb.logger.info('\tCompressing markup...')
 
         while text.tell() < self.text_length:
             data, overlap = create_text_record(text)
+            self.uncompressed_record_lengths.append(len(data))
             if self.compress:
                 data = compress_doc(data)
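
The lengths are recorded before compression because TBS geometry is computed from offsets into the uncompressed text: records are nominally RECORD_SIZE bytes, but the last one is short and a record can carry a few extra multibyte-overlap bytes, so the actual per-record lengths must be kept. A runnable sketch of the bookkeeping (simplified stand-in for create_text_record, which in calibre also handles the multibyte overlap):

    from io import BytesIO

    RECORD_SIZE = 4096

    def create_text_record(text):
        return text.read(RECORD_SIZE)  # simplified: no overlap handling

    text = BytesIO(b'x' * 10000)
    lengths = []
    while True:
        data = create_text_record(text)
        if not data:
            break
        lengths.append(len(data))  # captured before any compression
    print(lengths)  # [4096, 4096, 1808]: the last record is short
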
@@ -372,7 +374,7 @@ class KF8Writer(object):
             entry['length'] = get_next_start(entry) - entry['offset']
 
         self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
-                self.last_text_record_idx+1)
+                self.uncompressed_record_lengths)
         self.ncx_records = NCXIndex(entries)()
 
     def create_guide(self):
src/calibre/ebooks/mobi/writer8/tbs.py

@@ -7,103 +7,175 @@ __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from collections import namedtuple
-from functools import partial
+DOC = '''
+Trailing Byte Sequences contain information about which index entries touch a
+particular text record. Every text record has a set of trailing byte
+sequences. In order to figure out the sequence for a given text record, you
+have to first calculate all the indices that start, end, span and are
+contained within that text record. Then arrange the indices into 'strands',
+where each strand is a hierarchical progression from the top level index down.
+For the exact algorithm, see separate_strands(). The strands are then encoded
+into 'sequences', see encode_strands_as_sequences() and finally the sequences
+are turned into bytes.
+'''
+from collections import namedtuple, OrderedDict
+from operator import attrgetter
 
-from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
+from calibre.ebooks.mobi.utils import (encode_trailing_data,
         encode_tbs)
 
 Entry = namedtuple('IndexEntry', 'index start length depth parent '
-        'first_child last_child title')
-Data = namedtuple('Data', 'starts ends completes spans')
+        'first_child last_child title action start_offset length_offset '
+        'text_record_length')
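
Per DOC above, the pipeline for one text record is: classify the index entries that touch the record, arrange them into strands, encode the strands as (value, flags) sequences, and pack the sequences into bytes. A toy end-to-end run for a flat two-chapter ToC in a single 4096-byte record (assumes this module is importable; the four trailing None fields are filled in later by fill_entry):

    from calibre.ebooks.mobi.writer8.tbs import (Entry, collect_indexing_data,
            encode_strands_as_sequences, sequences_to_bytes)

    def chapter(i, start, length):
        return Entry(i, start, length, 0, None, None, None,
                'Chapter %d' % i, None, None, None, None)

    entries = [chapter(0, 0, 2000), chapter(1, 2000, 2096)]
    strands_per_record = collect_indexing_data(entries, [4096])
    sequences = encode_strands_as_sequences(strands_per_record[0])
    tbs_bytes = sequences_to_bytes(sequences)
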
 
-def collect_indexing_data(entries, number_of_text_records):
+def fill_entry(entry, start_offset, text_record_length):
+    length_offset = start_offset + entry.length
+    if start_offset < 0:
+        action = 'spans' if length_offset > text_record_length else 'ends'
+    else:
+        action = 'starts' if length_offset > text_record_length else 'completes'
+
+    return Entry(*(entry[:-4] + (action, start_offset, length_offset,
+        text_record_length)))
+
+def populate_strand(parent, entries):
+    ans = [parent]
+    children = [c for c in entries if c.parent == parent.index]
+    if children:
+        # Add first child to this strand, and recurse downwards
+        child = children[0]
+        entries.remove(child)
+        ans += populate_strand(child, entries)
+    else:
+        # Add any entries at the same depth that form a contiguous set of
+        # indices and belong to the same parent (these can all be
+        # represented as a single sequence with the 0b100 flag)
+        current_index = parent.index
+        siblings = []
+        for entry in list(entries):
+            if (entry.depth == parent.depth and entry.parent == parent.parent
+                    and entry.index == current_index+1):
+                current_index += 1
+                entries.remove(entry)
+                children = [c for c in entries if c.parent == entry.index]
+                if children:
+                    siblings += populate_strand(entry, entries)
+                    break # Cannot add more siblings, as we have added children
+                else:
+                    siblings.append(entry)
+        ans += siblings
+    return ans
+
+def separate_strands(entries):
+    ans = []
+    while entries:
+        top, entries = entries[0], entries[1:]
+        strand = populate_strand(top, entries)
+        layers = OrderedDict()
+        for entry in strand:
+            if entry.depth not in layers:
+                layers[entry.depth] = []
+            layers[entry.depth].append(entry)
+        ans.append(layers)
+    return ans
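
populate_strand follows a single hierarchical chain (an entry, then its first child, recursively) or, failing that, gathers a run of consecutive same-depth siblings; separate_strands repeats this until every entry is consumed and groups each strand's entries by depth in an OrderedDict. A toy illustration with a stand-in tuple carrying only the fields the strand logic reads:

    from collections import namedtuple
    from calibre.ebooks.mobi.writer8.tbs import separate_strands

    E = namedtuple('E', 'index depth parent')

    # index 0 has children 1 and 2; index 3 is an unrelated top-level entry
    entries = [E(0, 0, None), E(1, 1, 0), E(2, 1, 0), E(3, 0, None)]
    strands = separate_strands(entries)
    # Two strands result:
    #   strand 0: {0: [E(0, ...)], 1: [E(1, ...), E(2, ...)]}
    #   strand 1: {0: [E(3, ...)]}
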
 
+def collect_indexing_data(entries, text_record_lengths):
     ''' For every text record calculate which index entries start, end, span or
-    are contained within that record.'''
+    are contained within that record. Arrange these entries in 'strands'. '''
 
     data = []
-    for i in xrange(number_of_text_records):
-        record_start, next_record_start = i*RECORD_SIZE, (i+1)*RECORD_SIZE
-        datum = Data([], [], [], [])
-        data.append(datum)
+    entries = sorted(entries, key=attrgetter('start'))
+    record_start = 0
+    for rec_length in text_record_lengths:
+        next_record_start = record_start + rec_length
+        local_entries = []
 
         for entry in entries:
-            end = entry.start + entry.length - 1
-            if (entry.start >= next_record_start or end < record_start):
-                # This entry does not have any overlap with this record
+            if entry.start >= next_record_start:
+                # No more entries overlap this record
+                break
+            if entry.start + entry.length <= record_start:
+                # This entry does not touch this record
                 continue
-            if (entry.start < record_start and end >= next_record_start):
-                # This entry spans this record
-                datum.spans.append(entry)
-                continue
-            if (entry.start >= record_start and end < next_record_start):
-                # This entry is contained in this record
-                datum.completes.append(entry)
-            if (entry.start >= record_start and end >= next_record_start):
-                # This entry starts in this record
-                datum.starts.append(entry)
-                continue
-            if (entry.start < record_start and end < next_record_start):
-                # This entry ends in this record
-                datum.ends.append(entry)
+            local_entries.append(fill_entry(entry, entry.start - record_start,
+                rec_length))
 
-        for x in datum:
-            # Should be unnecessary as entries are already in this order, but
-            # best to be safe.
-            x.sort(key=lambda x:x.depth)
+        strands = separate_strands(local_entries)
+        data.append(strands)
+        record_start += rec_length
 
     return data
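
Because the entries are sorted by start, the sweep can break at the first entry that begins after the current record and skip those that ended before it; everything else touches the record, and fill_entry labels it by where its start and end offsets fall relative to the record. The classification rule, restated and checked:

    def classify(start_offset, length, text_record_length):
        # Mirrors fill_entry: offsets are relative to the record start
        length_offset = start_offset + length
        if start_offset < 0:
            return 'spans' if length_offset > text_record_length else 'ends'
        return 'starts' if length_offset > text_record_length else 'completes'

    print(classify(-100, 5000, 4096))  # spans: begins before, ends after
    print(classify(-100, 2000, 4096))  # ends: begins before, ends inside
    print(classify(1000, 5000, 4096))  # starts: begins inside, ends after
    print(classify(1000, 2000, 4096))  # completes: wholly inside
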
 
-def generate_tbs_for_flat_index(indexing_data):
+def encode_strands_as_sequences(strands, tbs_type=8):
+    ''' Encode the list of strands for a single text record into a list of
+    sequences, ready to be converted into TBS bytes. '''
     ans = []
-    record_type = 8 # 8 for KF8 0 for MOBI 6
-    enc = partial(encode_tbs, flag_size=3)
-    for datum in indexing_data:
-        tbs = b''
-        extra = {0b010 : record_type}
-        if not (datum.starts or datum.ends or datum.completes or datum.spans):
-            # No index entry touches this record
-            pass
-        elif datum.spans:
-            extra[0b001] = 0
-            tbs = enc(datum.spans[0].index, extra)
-        else:
-            starts, ends, completes = datum[:3]
-            if (not completes and len(starts) + len(ends) == 1):
-                # Either has the first or the last index, and no other indices.
-                node = (starts+ends)[0]
-                tbs = enc(node.index, extra)
-            else:
-                # This record contains the end of an index and
-                # some complete index entries. Or it contains some complete
-                # entries and a start. Or it contains an end, a start and
-                # optionally some completes. In every case, we encode the first
-                # entry to touch this record and the number of entries
-                # that touch this record.
-                nodes = starts + completes + ends
-                nodes.sort(key=lambda x:x.index)
-                extra[0b100] = len(nodes)
-                tbs = enc(nodes[0].index, extra)
-        ans.append(tbs)
+    last_index = None
+    max_length_offset = 0
+    first_entry = None
+    for strand in strands:
+        for entries in strand.itervalues():
+            for entry in entries:
+                if first_entry is None:
+                    first_entry = entry
+                if entry.length_offset > max_length_offset:
+                    max_length_offset = entry.length_offset
+
+    for strand in strands:
+        strand_seqs = []
+        for depth, entries in strand.iteritems():
+            extra = {}
+            if entries[-1].action == 'spans':
+                extra[0b1] = 0
+            elif False and (
+                    entries[-1].length_offset < entries[-1].text_record_length and
+                    entries[-1].action == 'completes' and
+                    entries[-1].length_offset != max_length_offset):
+                # I can't figure out exactly when kindlegen decides to insert
+                # this
+                extra[0b1] = entries[-1].length_offset
+
+            if entries[0] is first_entry:
+                extra[0b10] = tbs_type
+
+            if len(entries) > 1:
+                extra[0b100] = len(entries)
+
+            index = entries[0].index - (entries[0].parent or 0)
+            if ans and not strand_seqs:
+                extra[0b1000] = True
+                index = last_index - entries[0].index
+            last_index = entries[-1].index
+            strand_seqs.append((index, extra))
+
+        # Handle the case of consecutive action == 'spans' entries
+        for i, seq in enumerate(strand_seqs):
+            if i + 1 < len(strand_seqs):
+                if 0b1 in seq[1] and 0b1 in strand_seqs[i+1][1]:
+                    del seq[1][0b1]
+        ans.extend(strand_seqs)
 
     return ans
 
-def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
+def sequences_to_bytes(sequences):
+    ans = []
+    flag_size = 3
+    for val, extra in sequences:
+        ans.append(encode_tbs(val, extra, flag_size))
+        flag_size = 4
+    return b''.join(ans)
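
Note the flag-width asymmetry: the first sequence of a record's TBS is encoded with 3 flag bits and every later one with 4, which matches encode_strands_as_sequences setting the 0b1000 "offset from previous strand" flag only on non-initial sequences. A minimal sketch (assuming calibre's encode_tbs, which this module imports; the sequence values are illustrative):

    from calibre.ebooks.mobi.utils import encode_tbs

    sequences = [(1, {0b10: 8}), (1, {0b1000: True})]
    out, flag_size = [], 3
    for val, extra in sequences:
        out.append(encode_tbs(val, extra, flag_size))
        flag_size = 4  # every sequence after the first gets 4 flag bits
    tbs_bytes = b''.join(out)
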
+
+def apply_trailing_byte_sequences(index_table, records, text_record_lengths):
     entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
         r.get('parent', None), r.get('first_child', None), r.get('last_child',
-        None), r['label']) for r in index_table)
+        None), r['label'], None, None, None, None) for r in index_table)
 
-    indexing_data = collect_indexing_data(entries, number_of_text_records)
-    max_depth = max(e['depth'] for e in index_table)
-    if max_depth > 0:
-        # TODO: Implement for hierarchical ToCs
-        tbs = []
-    else:
-        tbs = generate_tbs_for_flat_index(indexing_data)
-    if not tbs:
-        return False
-    for i, tbs_bytes in enumerate(tbs):
+    indexing_data = collect_indexing_data(entries, text_record_lengths)
+    for i, strands in enumerate(indexing_data):
+        sequences = encode_strands_as_sequences(strands)
+        tbs_bytes = sequences_to_bytes(sequences)
         records[i+1] += encode_trailing_data(tbs_bytes)
 
     return True
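
Taken together, the writer path is now: build 12-field Entry tuples from the NCX index table, collect per-record strands, then encode each record's strands and append the bytes as trailing data (records[0] is the MOBI header record, so text record i lives at records[i+1]). A toy invocation with two flat chapters spread over two 4096-byte records (assumes calibre's modules are importable):

    from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences

    index_table = [
        {'index': 0, 'offset': 0, 'length': 5000, 'depth': 0, 'label': 'One'},
        {'index': 1, 'offset': 5000, 'length': 3192, 'depth': 0, 'label': 'Two'},
    ]
    records = [b'<header>', b'text record 0', b'text record 1']
    apply_trailing_byte_sequences(index_table, records, [4096, 4096])
    # records[1] and records[2] now end with their encoded TBS bytes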