KF8 Output: Generate trailing bytes for books with flat ToCs

This commit is contained in:
Kovid Goyal 2012-04-26 10:30:25 +05:30
parent 669fc85958
commit a883b577a6
6 changed files with 167 additions and 3 deletions

View File

@ -82,6 +82,9 @@ class Index(object):
def __str__(self): def __str__(self):
return '\n'.join(self.render()) return '\n'.join(self.render())
def __iter__(self):
    # Make Index objects directly iterable over their parsed entries.
    # NOTE(review): self.records is presumably populated by the Index
    # subclasses' __init__ — confirm against the rest of index.py.
    return iter(self.records)
class SKELIndex(Index): class SKELIndex(Index):
def __init__(self, skelidx, records, codec): def __init__(self, skelidx, records, codec):

View File

@ -14,7 +14,7 @@ from calibre import CurrentDir
from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex, from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
GuideIndex) GuideIndex)
from calibre.ebooks.mobi.utils import read_font_record from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@ -88,6 +88,7 @@ class MOBIFile(object):
self.read_fdst() self.read_fdst()
self.read_indices() self.read_indices()
self.build_files() self.build_files()
self.read_tbs()
def print_header(self, f=sys.stdout): def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f) print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -183,6 +184,45 @@ class MOBIFile(object):
self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext), self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
payload)) payload))
def read_tbs(self):
    # Decode and pretty-print the trailing byte sequences (TBS) attached to
    # each text record, for debugging. The human readable descriptions are
    # accumulated in self.indexing_data (one string per text record).
    from calibre.ebooks.mobi.writer8.tbs import (Entry,
            collect_indexing_data)
    entry_map = []
    for index in self.ncx_index:
        # An entry ends where the next entry at the same or a shallower
        # depth begins, or at the end of the raw text if there is none.
        enders = [e['pos'] for e in self.ncx_index if e['pos'] >
                index['pos'] and
                e['hlvl'] <= index['hlvl']]
        end = min(enders+[len(self.raw_text)])
        # -1 in the parent/child fields of the NCX record means "absent";
        # normalize that to None for the Entry namedtuple.
        entry_map.append(Entry(index=index['num'], title=index['text'],
            depth=index['hlvl'],
            parent=index['parent'] if index['parent'] > -1 else None,
            first_child=index['child1'] if index['child1'] > -1 else None,
            last_child=index['childn'] if index['childn'] > -1 else None,
            start=index['pos'], length=end-index['pos']))

    indexing_data = collect_indexing_data(entry_map,
            len(self.text_records))
    self.indexing_data = []
    for i, data in enumerate(indexing_data):
        rec = self.text_records[i]
        # The raw TBS bytes were split out of the record's trailing data
        # when the record was parsed; empty if the record has none.
        tbs_bytes = rec.trailing_data.get('indexing', b'')
        desc = ['Record #%d'%i]
        for x in ('starts', 'completes', 'ends', 'spans'):
            points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
                    getattr(data, x)]
            if points:
                desc.append(x+':')
                desc.extend(points)
        desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
        # flag_size=3 matches the encoding used by writer8/tbs.py for KF8
        val, extra, consumed = decode_tbs(tbs_bytes, flag_size=3)
        extra = {bin(k):v for k, v in extra.iteritems()}
        desc.append('First sequence: %r %r'%(val, extra))
        byts = tbs_bytes[consumed:]
        if byts:
            desc.append('Remaining bytes: %s'%format_bytes(byts))
        desc.append('')
        self.indexing_data.append('\n'.join(desc))
def inspect_mobi(mobi_file, ddir): def inspect_mobi(mobi_file, ddir):
f = MOBIFile(mobi_file) f = MOBIFile(mobi_file)
@ -193,7 +233,8 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of: with open(alltext, 'wb') as of:
of.write(f.raw_text) of.write(f.raw_text)
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'): for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
'tbs'):
os.mkdir(os.path.join(ddir, x)) os.mkdir(os.path.join(ddir, x))
for rec in f.text_records: for rec in f.text_records:
@ -219,6 +260,8 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'guide.record'), 'wb') as fo: with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
fo.write(str(f.guide_index).encode('utf-8')) fo.write(str(f.guide_index).encode('utf-8'))
with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
for part in f.files: for part in f.files:
part.dump(os.path.join(ddir, 'files')) part.dump(os.path.join(ddir, 'files'))

View File

@ -397,7 +397,10 @@ class MobiWriter(object):
header_fields['exth_flags'] = 0b100001010000 # Kindlegen uses this header_fields['exth_flags'] = 0b100001010000 # Kindlegen uses this
header_fields['fdst_record'] = NULL_INDEX header_fields['fdst_record'] = NULL_INDEX
header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1 header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
header_fields['extra_data_flags'] = 0b11 extra_data_flags = 0b1 # Has multibyte overlap bytes
if self.primary_index_record_idx is not None:
extra_data_flags |= 0b10
header_fields['extra_data_flags'] = extra_data_flags
for k, v in {'last_text_record':'last_text_record_idx', for k, v in {'last_text_record':'last_text_record_idx',
'first_non_text_record':'first_non_text_record_idx', 'first_non_text_record':'first_non_text_record_idx',

View File

@ -27,6 +27,7 @@ from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex, from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
ChunkIndex, GuideIndex) ChunkIndex, GuideIndex)
from calibre.ebooks.mobi.writer8.mobi import KF8Book from calibre.ebooks.mobi.writer8.mobi import KF8Book
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
XML_DOCS = OEB_DOCS | {SVG_MIME} XML_DOCS = OEB_DOCS | {SVG_MIME}
@ -39,6 +40,7 @@ class KF8Writer(object):
def __init__(self, oeb, opts, resources): def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.compress = not self.opts.dont_compress self.compress = not self.opts.dont_compress
self.has_tbs = False
self.log.info('Creating KF8 output') self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
@ -363,6 +365,8 @@ class KF8Writer(object):
for entry in entries: for entry in entries:
entry['length'] = get_next_start(entry) - entry['offset'] entry['length'] = get_next_start(entry) - entry['offset']
self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
self.last_text_record_idx+1)
self.ncx_records = NCXIndex(entries)() self.ncx_records = NCXIndex(entries)()
def create_guide(self): def create_guide(self):

View File

@ -250,6 +250,8 @@ class KF8Book(object):
self.full_title = utf8_text(unicode(metadata.title[0])) self.full_title = utf8_text(unicode(metadata.title[0]))
self.title_length = len(self.full_title) self.title_length = len(self.full_title)
self.extra_data_flags = 0b1 self.extra_data_flags = 0b1
if writer.has_tbs:
self.extra_data_flags |= 0b10
self.uid = random.randint(0, 0xffffffff) self.uid = random.randint(0, 0xffffffff)
self.language_code = iana2mobi(str(metadata.language[0])) self.language_code = iana2mobi(str(metadata.language[0]))

View File

@ -0,0 +1,109 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from collections import namedtuple
from functools import partial
from calibre.ebooks.mobi.utils import (RECORD_SIZE, encode_trailing_data,
encode_tbs)
# One ToC/NCX node flattened for TBS computation. start/length are offsets
# into the book's uncompressed text; parent/first_child/last_child are entry
# indices or None.
Entry = namedtuple('IndexEntry', 'index start length depth parent '
        'first_child last_child title')

# Per-text-record classification of index entries: those that start in the
# record, end in it, complete (start AND end) inside it, or span it entirely.
Data = namedtuple('Data', 'starts ends completes spans')
def collect_indexing_data(entries, number_of_text_records):
    ''' For every text record calculate which index entries start, end, span
    or are contained within that record. Returns a list with one Data
    instance per text record. '''
    data = []
    for rnum in xrange(number_of_text_records):
        rec_start = rnum * RECORD_SIZE
        rec_limit = rec_start + RECORD_SIZE  # exclusive upper bound
        datum = Data([], [], [], [])
        data.append(datum)
        for entry in entries:
            last = entry.start + entry.length - 1  # last byte of the entry
            if entry.start >= rec_limit or last < rec_start:
                # The entry has no overlap with this record at all
                continue
            opens_here = entry.start >= rec_start
            closes_here = last < rec_limit
            if opens_here and closes_here:
                # Entirely contained in this record
                datum.completes.append(entry)
            elif opens_here:
                # Begins in this record, continues into later ones
                datum.starts.append(entry)
            elif closes_here:
                # Began in an earlier record, finishes in this one
                datum.ends.append(entry)
            else:
                # Covers the whole record
                datum.spans.append(entry)
        # Keep every bucket ordered by depth. Should be unnecessary as
        # entries are already in this order, but best to be safe.
        for bucket in datum:
            bucket.sort(key=lambda e: e.depth)
    return data
def generate_tbs_for_flat_index(indexing_data):
    '''Generate the trailing byte sequence for every text record of a book
    whose ToC is flat (a single level deep). Returns a list of bytestrings,
    one per record (empty bytestring when the record needs no TBS).'''
    TYPE_KF8 = 8  # record type flag value: 8 for KF8, 0 for MOBI 6
    encode = partial(encode_tbs, flag_size=3)
    sequences = []
    for datum in indexing_data:
        flags = {0b010: TYPE_KF8}
        starts, ends, completes, spans = datum
        if spans:
            # A single index entry covers this entire record
            flags[0b001] = 0
            encoded = encode(spans[0].index, flags)
        elif not (starts or ends or completes):
            # No index entry touches this record
            encoded = b''
        elif not completes and len(starts) + len(ends) == 1:
            # Either the start or the end of exactly one index entry, and
            # no other indices.
            only = starts[0] if starts else ends[0]
            encoded = encode(only.index, flags)
        else:
            # This record contains the end of an index and some complete
            # index entries. Or some completes and a start. Or an end, a
            # start and optionally some completes. In every case, encode
            # the first entry to touch this record and the number of
            # entries that touch it.
            touching = sorted(starts + completes + ends,
                    key=lambda e: e.index)
            flags[0b100] = len(touching)
            encoded = encode(touching[0].index, flags)
        sequences.append(encoded)
    return sequences
def apply_trailing_byte_sequences(index_table, records, number_of_text_records):
    '''Compute trailing byte sequences from the NCX index table and append
    them (via encode_trailing_data) to each text record in records.

    :param index_table: sequence of NCX entry dicts (index, offset, length,
        depth, label, optional parent/first_child/last_child)
    :param records: the list of record bytestrings; records[0] is the
        header record, text records start at index 1 and are mutated here
    :param number_of_text_records: how many text records to process
    :return: True iff TBS were generated and applied
    '''
    if not index_table:
        # A book with no ToC has nothing to index, so no TBS. Without this
        # guard, max() below raises ValueError on an empty sequence.
        return False
    entries = tuple(Entry(r['index'], r['offset'], r['length'], r['depth'],
        r.get('parent', None), r.get('first_child', None), r.get('last_child',
            None), r['label']) for r in index_table)
    indexing_data = collect_indexing_data(entries, number_of_text_records)
    max_depth = max(e['depth'] for e in index_table)
    if max_depth > 0:
        # TODO: Implement for hierarchical ToCs
        tbs = []
    else:
        tbs = generate_tbs_for_flat_index(indexing_data)
    if not tbs:
        return False
    for i, tbs_bytes in enumerate(tbs):
        # i+1 because records[0] is the header record, not a text record
        records[i+1] += encode_trailing_data(tbs_bytes)
    return True