KF8: Fully decode FDST records

This commit is contained in:
Kovid Goyal 2012-04-10 18:06:20 +05:30
parent 5017ba10ca
commit 681d33416b
3 changed files with 64 additions and 19 deletions

View File

@ -7,10 +7,41 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, imghdr
import sys, os, imghdr, struct
from itertools import izip
from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
class FDST(object):
    '''
    Decoded form of a KF8 FDST record.

    Layout read by this class (all integers big-endian, unsigned 32 bit):
    the 4 byte signature ``FDST``, the offset to the section table, the
    number of sections, and then ``2 * num_sections`` integers that are
    taken pairwise as (start, end).

    Attributes set by the constructor:

    :attr sec_off: Offset of the section table inside the record
    :attr num_sections: Number of sections declared by the record
    :attr sections: Tuple of ``(start, end)`` tuples, one per section
    '''

    def __init__(self, raw):
        '''
        :param raw: The raw bytes of the FDST record

        Raises :class:`ValueError` if the signature is wrong, if the section
        table does not start at offset 12, or if any bytes follow the section
        table. A record too short to unpack surfaces as ``struct.error``.
        '''
        signature = raw[:4]
        if signature != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')

        header = struct.unpack_from(b'>LL', raw, 4)
        self.sec_off, self.num_sections = header
        # The table is expected immediately after the 12 byte header, any
        # other offset means there are fields this parser does not know
        if self.sec_off != 12:
            raise ValueError('FDST record has unknown extra fields')

        table_fmt = b'>%dL' % (self.num_sections*2)
        table_end = self.sec_off + struct.calcsize(table_fmt)
        boundaries = struct.unpack_from(table_fmt, raw, self.sec_off)

        trailing = raw[table_end:]
        if trailing:
            raise ValueError('FDST record has trailing data: '
                    '%s'%format_bytes(trailing))

        # Even positions hold the starts, odd positions the matching ends
        starts, ends = boundaries[::2], boundaries[1::2]
        self.sections = tuple(zip(starts, ends))

    def __str__(self):
        '''
        Human readable dump: a title line, the two header fields, a banner
        with the section count and one line per (start, end) pair.
        '''
        lines = [
            'FDST record',
            '%s: %s'%('Offset to sections', self.sec_off),
            '%s: %s'%('Number of section records', self.num_sections),
            '**** %d Sections ****'% len(self.sections),
        ]
        lines.extend('Start: %20d End: %d'%pair for pair in self.sections)
        return '\n'.join(lines)
class MOBIFile(object):
@ -31,7 +62,10 @@ class MOBIFile(object):
first_text_record+offset+h8.number_of_text_records])]
self.raw_text = b''.join(r.raw for r in self.text_records)
self.header = self.mf.mobi8_header
self.kf8_records = mf.records[offset:]
self.extract_resources()
self.read_fdst()
def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f)
@ -43,6 +77,15 @@ class MOBIFile(object):
print (file=f)
print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
def read_fdst(self):
    '''
    Decode the FDST record referenced by the KF8 header, if there is one.

    Sets ``self.fdst`` to ``None`` when the header's ``fdst_idx`` is
    ``NULL_INDEX``, otherwise to an :class:`FDST` built from the record at
    that index in ``self.kf8_records``.

    Raises :class:`ValueError` if the number of sections found in the
    record differs from the header's ``fdst_count``.
    '''
    self.fdst = None
    fdst_index = self.header.fdst_idx
    if fdst_index == NULL_INDEX:
        # Header does not point at an FDST record, nothing to decode
        return

    record = self.kf8_records[fdst_index]
    self.fdst = FDST(record.raw)
    # Cross check the record against the count stored in the header
    if self.fdst.num_sections != self.header.fdst_count:
        raise ValueError('KF8 Header contains invalid FDST count')
def extract_resources(self):
self.resource_map = []
known_types = {b'FLIS', b'FCIS', b'SRCS',
@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir):
rec.dump(os.path.join(ddir, 'text_records'))
for href, payload in f.resource_map:
with open(os.path.join(ddir, href), 'wb') as f:
f.write(payload)
with open(os.path.join(ddir, href), 'wb') as fo:
fo.write(payload)
if f.fdst:
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8'))

View File

@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
# Names given, in order, to the consecutive 32 bit words of an INDX header
# (parse_indx_header zips this tuple with the unpacked values). The 27
# 'unknownN' entries are placeholders for words with no assigned meaning.
INDEX_HEADER_FIELDS = (
    ('len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 'lng', 'total',
     'ordt', 'ligt', 'nligt', 'ncncx')
    + tuple('unknown%d'%n for n in xrange(27))
    + ('ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx')
)
class InvalidFile(ValueError):
    '''
    :class:`ValueError` subclass with no behaviour of its own.

    NOTE(review): presumably raised when record data fails validation,
    confirm against the callers in this module.
    '''
@ -36,11 +42,7 @@ def format_bytes(byts):
def parse_indx_header(data):
check_signature(data, b'INDX')
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
'ordt1', 'ordt2', 'tagx')
words = INDEX_HEADER_FIELDS
num = len(words)
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
ans = dict(zip(words, values))

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
import struct, re, os, imghdr
from collections import namedtuple
from itertools import repeat
from itertools import repeat, izip
from urlparse import urldefrag
from lxml import etree
@ -71,16 +71,16 @@ class Mobi8Reader(object):
return self.write_opf(guide, ncx, spine, resource_map)
def read_indices(self):
self.flow_table = (0, NULL_INDEX)
self.flow_table = ()
if self.header.fdstidx != NULL_INDEX:
header = self.kf8_sections[self.header.fdstidx][0]
if header[:4] != b'FDST':
raise ValueError('KF8 does not have a valid FDST record')
num_sections, = struct.unpack_from(b'>L', header, 0x08)
sections = header[0x0c:]
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
sections, 0)[::2] + (NULL_INDEX,)
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
header, sec_start)
self.flow_table = tuple(izip(secs[::2], secs[1::2]))
self.files = []
if self.header.skelidx != NULL_INDEX:
@ -127,13 +127,10 @@ class Mobi8Reader(object):
raw_ml = self.mobi6_reader.mobi_html
self.flows = []
self.flowinfo = []
ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
# now split the raw_ml into its flow pieces
for j in xrange(0, len(self.flow_table)-1):
start = self.flow_table[j]
end = self.flow_table[j+1]
if end == NULL_INDEX:
end = len(raw_ml)
for start, end in ft:
self.flows.append(raw_ml[start:end])
# the first piece represents the xhtml text