From 681d33416ba1a38b5d6b57cdb1654c4c22f03012 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 10 Apr 2012 18:06:20 +0530 Subject: [PATCH] KF8: Fully decode FDST records --- src/calibre/ebooks/mobi/debug/mobi8.py | 52 +++++++++++++++++++++++-- src/calibre/ebooks/mobi/reader/index.py | 12 +++--- src/calibre/ebooks/mobi/reader/mobi8.py | 19 ++++----- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index 20fd419e29..b869b8b69b 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -7,10 +7,41 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, imghdr +import sys, os, imghdr, struct +from itertools import izip from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.utils import read_font_record +from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.reader.headers import NULL_INDEX + +class FDST(object): + + def __init__(self, raw): + if raw[:4] != b'FDST': + raise ValueError('KF8 does not have a valid FDST record') + self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4) + if self.sec_off != 12: + raise ValueError('FDST record has unknown extra fields') + secf = b'>%dL' % (self.num_sections*2) + secs = struct.unpack_from(secf, raw, self.sec_off) + rest = raw[self.sec_off+struct.calcsize(secf):] + if rest: + raise ValueError('FDST record has trailing data: ' + '%s'%format_bytes(rest)) + self.sections = tuple(izip(secs[::2], secs[1::2])) + + def __str__(self): + ans = ['FDST record'] + a = lambda k, v:ans.append('%s: %s'%(k, v)) + a('Offset to sections', self.sec_off) + a('Number of section records', self.num_sections) + ans.append('**** %d Sections ****'% len(self.sections)) + for sec in self.sections: + ans.append('Start: %20d End: %d'%sec) + + return '\n'.join(ans) + class MOBIFile(object): @@ -31,7 +62,10 @@ class MOBIFile(object): first_text_record+offset+h8.number_of_text_records])] self.raw_text = b''.join(r.raw for r in self.text_records) + self.header = self.mf.mobi8_header + self.kf8_records = mf.records[offset:] self.extract_resources() + self.read_fdst() def print_header(self, f=sys.stdout): print (str(self.mf.palmdb).encode('utf-8'), file=f) @@ -43,6 +77,15 @@ class MOBIFile(object): print (file=f) print (str(self.mf.mobi8_header).encode('utf-8'), file=f) + def read_fdst(self): + self.fdst = None + + if self.header.fdst_idx != NULL_INDEX: + idx = self.header.fdst_idx + self.fdst = FDST(self.kf8_records[idx].raw) + if self.fdst.num_sections != self.header.fdst_count: + raise ValueError('KF8 Header contains invalid FDST count') + def extract_resources(self): self.resource_map = [] known_types = {b'FLIS', b'FCIS', b'SRCS', @@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir): rec.dump(os.path.join(ddir, 'text_records')) for href, payload in f.resource_map: - with open(os.path.join(ddir, href), 'wb') as f: - f.write(payload) + with open(os.path.join(ddir, href), 'wb') as fo: + fo.write(payload) + if f.fdst: + with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo: + fo.write(str(f.fdst).encode('utf-8')) diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index d8a88227c8..12a4ff8367 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits, TagX = namedtuple('TagX', 'tag num_of_values bitmask eof') PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values') +INDEX_HEADER_FIELDS = ( + 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', + 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' + ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries', + 'ordt1', 'ordt2', 'tagx') + class InvalidFile(ValueError): pass @@ -36,11 +42,7 @@ def format_bytes(byts): def parse_indx_header(data): check_signature(data, b'INDX') - words = ( - 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', - 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' - ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries', - 'ordt1', 'ordt2', 'tagx') + words = INDEX_HEADER_FIELDS num = len(words) values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) ans = dict(zip(words, values)) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 0ca5341780..bf068eb498 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en' import struct, re, os, imghdr from collections import namedtuple -from itertools import repeat +from itertools import repeat, izip from urlparse import urldefrag from lxml import etree @@ -71,16 +71,16 @@ class Mobi8Reader(object): return self.write_opf(guide, ncx, spine, resource_map) def read_indices(self): - self.flow_table = (0, NULL_INDEX) + self.flow_table = () if self.header.fdstidx != NULL_INDEX: header = self.kf8_sections[self.header.fdstidx][0] if header[:4] != b'FDST': raise ValueError('KF8 does not have a valid FDST record') - num_sections, = struct.unpack_from(b'>L', header, 0x08) - sections = header[0x0c:] - self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2), - sections, 0)[::2] + (NULL_INDEX,) + sec_start, num_sections = struct.unpack_from(b'>LL', header, 4) + secs = struct.unpack_from(b'>%dL' % (num_sections*2), + header, sec_start) + self.flow_table = tuple(izip(secs[::2], secs[1::2])) self.files = [] if self.header.skelidx != NULL_INDEX: @@ -127,13 +127,10 @@ class Mobi8Reader(object): raw_ml = self.mobi6_reader.mobi_html self.flows = [] self.flowinfo = [] + ft = self.flow_table if self.flow_table else [(0, len(raw_ml))] # now split the raw_ml into its flow pieces - for j in xrange(0, len(self.flow_table)-1): - start = self.flow_table[j] - end = self.flow_table[j+1] - if end == NULL_INDEX: - end = len(raw_ml) + for start, end in ft: self.flows.append(raw_ml[start:end]) # the first piece represents the xhtml text