KF8: Fully decode FDST records

This commit is contained in:
Kovid Goyal 2012-04-10 18:06:20 +05:30
parent 5017ba10ca
commit 681d33416b
3 changed files with 64 additions and 19 deletions

View File

@ -7,10 +7,41 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import sys, os, imghdr import sys, os, imghdr, struct
from itertools import izip
from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.utils import read_font_record from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
class FDST(object):
    """Decoded form of a KF8 FDST record.

    The record starts with the four byte signature ``FDST``, followed by
    two big-endian unsigned 32-bit integers: the offset at which the
    section table starts and the number of sections. The section table
    holds two big-endian unsigned 32-bit integers (start, end) per section.

    Attributes set on success:
        sec_off: offset of the section table inside the record (always 12).
        num_sections: number of (start, end) pairs in the table.
        sections: tuple of ``(start, end)`` tuples, in record order.

    :param raw: the raw bytes of the record.
    :raises ValueError: if the signature is wrong, the record is too short,
        the section table does not start at offset 12, or there is data
        after the section table.
    """

    def __init__(self, raw):
        if raw[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        # Signature (4 bytes) + offset (4 bytes) + count (4 bytes). Checked
        # up front so a truncated record is reported as a ValueError like
        # every other malformation, instead of leaking a struct.error.
        if len(raw) < 12:
            raise ValueError('FDST record is too short to contain a header')
        self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
        if self.sec_off != 12:
            raise ValueError('FDST record has unknown extra fields')
        secf = b'>%dL' % (self.num_sections*2)
        table_end = self.sec_off + struct.calcsize(secf)
        if len(raw) < table_end:
            raise ValueError('FDST record is truncated: expected %d bytes,'
                    ' got %d' % (table_end, len(raw)))
        secs = struct.unpack_from(secf, raw, self.sec_off)
        rest = raw[table_end:]
        if rest:
            raise ValueError('FDST record has trailing data: '
                    '%s'%format_bytes(rest))
        # Even positions are section starts, odd positions section ends.
        # The result is materialised into a tuple, so the builtin zip
        # gives the same value as a lazy iterator would.
        self.sections = tuple(zip(secs[::2], secs[1::2]))

    def __str__(self):
        """Return a multi-line, human readable dump of the record."""
        ans = ['FDST record']
        for key, val in (
                ('Offset to sections', self.sec_off),
                ('Number of section records', self.num_sections),
        ):
            ans.append('%s: %s'%(key, val))
        ans.append('**** %d Sections ****'% len(self.sections))
        ans.extend('Start: %20d End: %d'%sec for sec in self.sections)
        return '\n'.join(ans)
class MOBIFile(object): class MOBIFile(object):
@ -31,7 +62,10 @@ class MOBIFile(object):
first_text_record+offset+h8.number_of_text_records])] first_text_record+offset+h8.number_of_text_records])]
self.raw_text = b''.join(r.raw for r in self.text_records) self.raw_text = b''.join(r.raw for r in self.text_records)
self.header = self.mf.mobi8_header
self.kf8_records = mf.records[offset:]
self.extract_resources() self.extract_resources()
self.read_fdst()
def print_header(self, f=sys.stdout): def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f) print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -43,6 +77,15 @@ class MOBIFile(object):
print (file=f) print (file=f)
print (str(self.mf.mobi8_header).encode('utf-8'), file=f) print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
def read_fdst(self):
    """Decode the FDST record referenced by the KF8 header, if any.

    Sets ``self.fdst`` to ``None`` when the header's ``fdst_idx`` is
    ``NULL_INDEX``; otherwise to an ``FDST`` built from the raw bytes of
    the KF8 record at that index.

    :raises ValueError: if the number of sections in the decoded record
        differs from the header's ``fdst_count``.
    """
    self.fdst = None
    fdst_idx = self.header.fdst_idx
    if fdst_idx == NULL_INDEX:
        # Header does not point at an FDST record; nothing to decode.
        return
    record = self.kf8_records[fdst_idx]
    self.fdst = FDST(record.raw)
    if self.fdst.num_sections != self.header.fdst_count:
        raise ValueError('KF8 Header contains invalid FDST count')
def extract_resources(self): def extract_resources(self):
self.resource_map = [] self.resource_map = []
known_types = {b'FLIS', b'FCIS', b'SRCS', known_types = {b'FLIS', b'FCIS', b'SRCS',
@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir):
rec.dump(os.path.join(ddir, 'text_records')) rec.dump(os.path.join(ddir, 'text_records'))
for href, payload in f.resource_map: for href, payload in f.resource_map:
with open(os.path.join(ddir, href), 'wb') as f: with open(os.path.join(ddir, href), 'wb') as fo:
f.write(payload) fo.write(payload)
if f.fdst:
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8'))

View File

@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof') TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values') PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
# Names of the 32-bit words of an INDX header, in the order in which they are
# unpacked by parse_indx_header. The 27 words whose meaning is not known are
# named unknown0 ... unknown26.
INDEX_HEADER_FIELDS = (
    (
        'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
        'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx',
    )
    + tuple('unknown%d'%i for i in range(27))
    + ('ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx')
)
class InvalidFile(ValueError): class InvalidFile(ValueError):
pass pass
@ -36,11 +42,7 @@ def format_bytes(byts):
def parse_indx_header(data): def parse_indx_header(data):
check_signature(data, b'INDX') check_signature(data, b'INDX')
words = ( words = INDEX_HEADER_FIELDS
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
'ordt1', 'ordt2', 'tagx')
num = len(words) num = len(words)
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
ans = dict(zip(words, values)) ans = dict(zip(words, values))

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
import struct, re, os, imghdr import struct, re, os, imghdr
from collections import namedtuple from collections import namedtuple
from itertools import repeat from itertools import repeat, izip
from urlparse import urldefrag from urlparse import urldefrag
from lxml import etree from lxml import etree
@ -71,16 +71,16 @@ class Mobi8Reader(object):
return self.write_opf(guide, ncx, spine, resource_map) return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self): def read_indices(self):
self.flow_table = (0, NULL_INDEX) self.flow_table = ()
if self.header.fdstidx != NULL_INDEX: if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0] header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST': if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record') raise ValueError('KF8 does not have a valid FDST record')
num_sections, = struct.unpack_from(b'>L', header, 0x08) sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
sections = header[0x0c:] secs = struct.unpack_from(b'>%dL' % (num_sections*2),
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2), header, sec_start)
sections, 0)[::2] + (NULL_INDEX,) self.flow_table = tuple(izip(secs[::2], secs[1::2]))
self.files = [] self.files = []
if self.header.skelidx != NULL_INDEX: if self.header.skelidx != NULL_INDEX:
@ -127,13 +127,10 @@ class Mobi8Reader(object):
raw_ml = self.mobi6_reader.mobi_html raw_ml = self.mobi6_reader.mobi_html
self.flows = [] self.flows = []
self.flowinfo = [] self.flowinfo = []
ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
# now split the raw_ml into its flow pieces # now split the raw_ml into its flow pieces
for j in xrange(0, len(self.flow_table)-1): for start, end in ft:
start = self.flow_table[j]
end = self.flow_table[j+1]
if end == NULL_INDEX:
end = len(raw_ml)
self.flows.append(raw_ml[start:end]) self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text # the first piece represents the xhtml text