mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KF8: Fully decode FDST records
This commit is contained in:
parent
5017ba10ca
commit
681d33416b
@ -7,10 +7,41 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os, imghdr
|
||||
import sys, os, imghdr, struct
|
||||
from itertools import izip
|
||||
|
||||
from calibre.ebooks.mobi.debug.headers import TextRecord
|
||||
from calibre.ebooks.mobi.utils import read_font_record
|
||||
from calibre.ebooks.mobi.debug import format_bytes
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
|
||||
class FDST(object):
    '''
    Decoded form of a KF8 FDST record.

    The record begins with the signature ``b'FDST'``, followed by two
    big-endian unsigned 32-bit integers (the offset of the section table and
    the number of sections) and then the section table itself: one
    ``(start, end)`` pair of big-endian unsigned 32-bit integers per section.

    Attributes set by the constructor:

    * ``sec_off`` -- offset of the section table (always 12, see below)
    * ``num_sections`` -- number of sections declared in the record
    * ``sections`` -- tuple of ``(start, end)`` tuples, one per section
    '''

    def __init__(self, raw):
        '''
        Parse the bytes of an FDST record.

        :param raw: The complete raw bytes of the record.
        :raises ValueError: If the signature is not ``b'FDST'``, if the
            section table does not start at offset 12, or if any bytes
            follow the section table.
        :raises struct.error: If the record is too short to hold the fields
            it declares.
        '''
        if raw[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
        # The fixed part of the record is 12 bytes (signature + two 32-bit
        # integers); any other offset means there are fields between it and
        # the section table that this class does not decode.
        if self.sec_off != 12:
            raise ValueError('FDST record has unknown extra fields')
        # Two 32-bit values (start, end) per section
        secf = b'>%dL' % (self.num_sections*2)
        secs = struct.unpack_from(secf, raw, self.sec_off)
        rest = raw[self.sec_off+struct.calcsize(secf):]
        if rest:
            raise ValueError('FDST record has trailing data: '
                    '%s'%format_bytes(rest))
        # Pair up the flat sequence: even positions are starts, odd
        # positions are ends. The builtin zip() is sufficient here because
        # the result is materialized into a tuple immediately.
        self.sections = tuple(zip(secs[::2], secs[1::2]))

    def __str__(self):
        '''
        Return a multi-line, human readable description of the record: the
        two header values followed by one line per section.
        '''
        ans = ['FDST record']
        for key, val in (
                ('Offset to sections', self.sec_off),
                ('Number of section records', self.num_sections),
        ):
            ans.append('%s: %s'%(key, val))
        ans.append('**** %d Sections ****'% len(self.sections))
        ans.extend('Start: %20d End: %d'%sec for sec in self.sections)

        return '\n'.join(ans)
|
||||
|
||||
|
||||
class MOBIFile(object):
|
||||
|
||||
@ -31,7 +62,10 @@ class MOBIFile(object):
|
||||
first_text_record+offset+h8.number_of_text_records])]
|
||||
|
||||
self.raw_text = b''.join(r.raw for r in self.text_records)
|
||||
self.header = self.mf.mobi8_header
|
||||
self.kf8_records = mf.records[offset:]
|
||||
self.extract_resources()
|
||||
self.read_fdst()
|
||||
|
||||
def print_header(self, f=sys.stdout):
|
||||
print (str(self.mf.palmdb).encode('utf-8'), file=f)
|
||||
@ -43,6 +77,15 @@ class MOBIFile(object):
|
||||
print (file=f)
|
||||
print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
|
||||
|
||||
def read_fdst(self):
    '''
    Decode the FDST record referenced by the KF8 header, if there is one.

    Sets ``self.fdst`` to ``None`` when the header's ``fdst_idx`` is
    ``NULL_INDEX``; otherwise sets it to an :class:`FDST` built from the
    raw bytes of the record at that index in ``self.kf8_records``.

    :raises ValueError: If the number of sections in the decoded record
        differs from the header's ``fdst_count``.
    '''
    self.fdst = None

    # Nothing to decode when the header does not point at an FDST record
    if self.header.fdst_idx == NULL_INDEX:
        return

    record = self.kf8_records[self.header.fdst_idx]
    self.fdst = FDST(record.raw)
    # The header carries its own section count; it must agree with the
    # count stored in the record itself
    if self.fdst.num_sections != self.header.fdst_count:
        raise ValueError('KF8 Header contains invalid FDST count')
|
||||
|
||||
def extract_resources(self):
|
||||
self.resource_map = []
|
||||
known_types = {b'FLIS', b'FCIS', b'SRCS',
|
||||
@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir):
|
||||
rec.dump(os.path.join(ddir, 'text_records'))
|
||||
|
||||
for href, payload in f.resource_map:
|
||||
with open(os.path.join(ddir, href), 'wb') as f:
|
||||
f.write(payload)
|
||||
with open(os.path.join(ddir, href), 'wb') as fo:
|
||||
fo.write(payload)
|
||||
|
||||
if f.fdst:
|
||||
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
|
||||
fo.write(str(f.fdst).encode('utf-8'))
|
||||
|
||||
|
@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,
|
||||
|
||||
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
|
||||
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
|
||||
# Names, in record order, of the header fields of an INDX record.
# parse_indx_header() unpacks one big-endian unsigned 32-bit value per name,
# starting at byte 4 of the record (directly after the b'INDX' signature).
# The 27 'unknownN' entries stand for fields with no assigned name.
INDEX_HEADER_FIELDS = (
    'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
    'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx',
) + tuple('unknown%d' % i for i in range(27)) + (
    'ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx',
)
|
||||
|
||||
|
||||
class InvalidFile(ValueError):
|
||||
pass
|
||||
@ -36,11 +42,7 @@ def format_bytes(byts):
|
||||
|
||||
def parse_indx_header(data):
|
||||
check_signature(data, b'INDX')
|
||||
words = (
|
||||
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
||||
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
|
||||
'ordt1', 'ordt2', 'tagx')
|
||||
words = INDEX_HEADER_FIELDS
|
||||
num = len(words)
|
||||
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
|
||||
ans = dict(zip(words, values))
|
||||
|
@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, re, os, imghdr
|
||||
from collections import namedtuple
|
||||
from itertools import repeat
|
||||
from itertools import repeat, izip
|
||||
from urlparse import urldefrag
|
||||
|
||||
from lxml import etree
|
||||
@ -71,16 +71,16 @@ class Mobi8Reader(object):
|
||||
return self.write_opf(guide, ncx, spine, resource_map)
|
||||
|
||||
def read_indices(self):
|
||||
self.flow_table = (0, NULL_INDEX)
|
||||
self.flow_table = ()
|
||||
|
||||
if self.header.fdstidx != NULL_INDEX:
|
||||
header = self.kf8_sections[self.header.fdstidx][0]
|
||||
if header[:4] != b'FDST':
|
||||
raise ValueError('KF8 does not have a valid FDST record')
|
||||
num_sections, = struct.unpack_from(b'>L', header, 0x08)
|
||||
sections = header[0x0c:]
|
||||
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
|
||||
sections, 0)[::2] + (NULL_INDEX,)
|
||||
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
|
||||
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
|
||||
header, sec_start)
|
||||
self.flow_table = tuple(izip(secs[::2], secs[1::2]))
|
||||
|
||||
self.files = []
|
||||
if self.header.skelidx != NULL_INDEX:
|
||||
@ -127,13 +127,10 @@ class Mobi8Reader(object):
|
||||
raw_ml = self.mobi6_reader.mobi_html
|
||||
self.flows = []
|
||||
self.flowinfo = []
|
||||
ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
|
||||
|
||||
# now split the raw_ml into its flow pieces
|
||||
for j in xrange(0, len(self.flow_table)-1):
|
||||
start = self.flow_table[j]
|
||||
end = self.flow_table[j+1]
|
||||
if end == NULL_INDEX:
|
||||
end = len(raw_ml)
|
||||
for start, end in ft:
|
||||
self.flows.append(raw_ml[start:end])
|
||||
|
||||
# the first piece represents the xhtml text
|
||||
|
Loading…
x
Reference in New Issue
Block a user