Review note: the line structure and indentation of this patch were rebuilt
from a copy in which all newlines had been collapsed. Whitespace on context
lines is a best-effort reconstruction, so verify against the target tree or
apply with `git apply --ignore-whitespace`. The blob ids on the "index" header
lines are those of the pre-review patch and do not reflect the edits below.

diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py
new file mode 100644
index 0000000000..4b5e6fb315
--- /dev/null
+++ b/src/calibre/ebooks/mobi/debug/index.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from collections import OrderedDict, namedtuple
+
+from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
+        parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
+
+# Parsed form of one SKEL index entry, see SKELIndex
+File = namedtuple('File',
+    'file_number name divtbl_count start_position length')
+
+# Parsed form of one SECT index entry, see SECTIndex
+Elem = namedtuple('Elem',
+    'insert_pos toc_text file_number sequence_number start_pos '
+    'length')
+
+def read_index(sections, idx, codec):
+    '''
+    Parse the INDX header record at ``sections[idx]``, the index records
+    that follow it and, if the header declares any, the CNCX records after
+    those.
+
+    :param sections: Sequence of records exposing their bytes as ``.raw``
+    :param idx: Position of the INDX header record in ``sections``
+    :param codec: Text codec handed to the CNCX and index record parsers
+    :return: The tuple ``(table, cncx, indx_header)``
+    '''
+    table, cncx = OrderedDict(), CNCX([], codec)
+
+    data = sections[idx].raw
+
+    indx_header = parse_indx_header(data)
+    indx_count = indx_header['count']
+
+    if indx_header['ncncx'] > 0:
+        # CNCX records are read from directly after the index records
+        off = idx + indx_count + 1
+        cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
+        cncx = CNCX(cncx_records, codec)
+
+    tag_section_start = indx_header['tagx']
+    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
+
+    for i in xrange(idx + 1, idx + 1 + indx_count):
+        # Index record
+        data = sections[i].raw
+        parse_index_record(table, data, control_byte_count, tags, codec,
+                indx_header['ordt_map'], strict=True)
+    return table, cncx, indx_header
+
+class Index(object):
+    '''
+    Debug view of one index. ``table``, ``cncx`` and ``header`` stay ``None``
+    when ``idx`` is ``NULL_INDEX``. Subclasses fill ``records`` with parsed
+    entries.
+    '''
+
+    def __init__(self, idx, records, codec):
+        self.table = self.cncx = self.header = self.records = None
+        if idx != NULL_INDEX:
+            self.table, self.cncx, self.header = read_index(records, idx, codec)
+
+    def render(self):
+        ''' Return the human readable dump of this index as a list of lines '''
+        ans = ['*'*10 + ' Index Header ' + '*'*10]
+        a = ans.append
+        if self.header is not None:
+            for field in INDEX_HEADER_FIELDS:
+                a('%-12s: %r'%(field, self.header[field]))
+        ans.extend(['', ''])
+
+        # Relies on CNCX truth testing to skip an empty CNCX
+        if self.cncx:
+            a('*'*10 + ' CNCX ' + '*'*10)
+            for offset, val in self.cncx.iteritems():
+                a('%10s: %s'%(offset, val))
+            ans.extend(['', ''])
+
+        if self.table is not None:
+            a('*'*10 + ' %d Index Entries '%len(self.table) + '*'*10)
+            for k, v in self.table.iteritems():
+                a('%s: %r'%(k, v))
+
+        if self.records:
+            ans.extend(['', '', '*'*10 + ' Parsed Entries ' + '*'*10])
+            for f in self.records:
+                a(repr(f))
+
+        return ans + ['']
+
+    def __str__(self):
+        return '\n'.join(self.render())
+
+class SKELIndex(Index):
+    ''' The SKEL index, each entry is parsed into a :class:`File` '''
+
+    def __init__(self, skelidx, records, codec):
+        super(SKELIndex, self).__init__(skelidx, records, codec)
+        self.records = []
+
+        if self.table is not None:
+            for i, (text, tag_map) in enumerate(self.table.iteritems()):
+                tags = set(tag_map.iterkeys())
+                if tags != {1, 6}:
+                    raise ValueError('SKEL Index has unknown tags: %s'%
+                            (tags-{1,6}))
+                self.records.append(File(
+                    i, # file_number
+                    text, # name
+                    tag_map[1][0], # divtbl_count
+                    tag_map[6][0], # start_pos
+                    tag_map[6][1]) # length
+                )
+
+class SECTIndex(Index):
+    '''
+    The SECT index, each entry is parsed into an :class:`Elem` whose
+    ``toc_text`` is looked up in the CNCX.
+    '''
+
+    def __init__(self, sectidx, records, codec):
+        super(SECTIndex, self).__init__(sectidx, records, codec)
+        self.records = []
+
+        if self.table is not None:
+            for text, tag_map in self.table.iteritems():
+                tags = set(tag_map.iterkeys())
+                if tags != {2, 3, 4, 6}:
+                    raise ValueError('SECT Index has unknown tags: %s'%
+                            (tags-{2, 3, 4, 6}))
+
+                toc_text = self.cncx[tag_map[2][0]]
+                self.records.append(Elem(
+                    int(text), # insert_pos
+                    toc_text, # toc_text
+                    tag_map[3][0], # file_number
+                    tag_map[4][0], # sequence_number
+                    tag_map[6][0], # start_pos
+                    tag_map[6][1] # length
+                    )
+                )
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index 4e4fcfae57..c1e6221ca7 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -11,6 +11,7 @@ import sys, os, imghdr, struct
 from itertools import izip
 
 from calibre.ebooks.mobi.debug.headers import TextRecord
+from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex)
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.mobi.debug import format_bytes
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@@ -65,6 +66,7 @@ class MOBIFile(object):
         self.header = self.mf.mobi8_header
         self.extract_resources()
         self.read_fdst()
+        self.read_indices()
 
     def print_header(self, f=sys.stdout):
         print (str(self.mf.palmdb).encode('utf-8'), file=f)
@@ -85,6 +87,13 @@ class MOBIFile(object):
             if self.fdst.num_sections != self.header.fdst_count:
                 raise ValueError('KF8 Header contains invalid FDST count')
 
+    def read_indices(self):
+        ''' Parse the SKEL and SECT indices that self.header points to '''
+        self.skel_index = SKELIndex(self.header.skel_idx, self.mf.records,
+                self.header.encoding)
+        self.sect_index = SECTIndex(self.header.sect_idx, self.mf.records,
+                self.header.encoding)
+
     def extract_resources(self):
         self.resource_map = []
         known_types = {b'FLIS', b'FCIS', b'SRCS',
@@ -145,3 +154,9 @@ def inspect_mobi(mobi_file, ddir):
     with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
         fo.write(str(f.fdst).encode('utf-8'))
 
+    with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
+        fo.write(str(f.skel_index).encode('utf-8'))
+
+    with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+        fo.write(str(f.sect_index).encode('utf-8'))
+
diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py
index 12a4ff8367..f5add94eac 100644
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -111,6 +111,14 @@ class CNCX(object): # {{{
 
     def get(self, offset, default=None):
         return self.records.get(offset, default)
+
+    def __bool__(self):
+        return bool(self.records)
+    # Python 2 truth testing looks up __nonzero__, not __bool__
+    __nonzero__ = __bool__
+
+    def iteritems(self):
+        return self.records.iteritems()
 # }}}
 
 def parse_tagx_section(data):