mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KF8: Fully decode FDST records
This commit is contained in:
parent
5017ba10ca
commit
681d33416b
@ -7,10 +7,41 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import sys, os, imghdr
|
import sys, os, imghdr, struct
|
||||||
|
from itertools import izip
|
||||||
|
|
||||||
from calibre.ebooks.mobi.debug.headers import TextRecord
|
from calibre.ebooks.mobi.debug.headers import TextRecord
|
||||||
from calibre.ebooks.mobi.utils import read_font_record
|
from calibre.ebooks.mobi.utils import read_font_record
|
||||||
|
from calibre.ebooks.mobi.debug import format_bytes
|
||||||
|
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||||
|
|
||||||
|
class FDST(object):

    '''
    Decoded form of a KF8 FDST record.

    The layout parsed here is: the four byte signature ``FDST``, two
    big-endian unsigned 32 bit integers (the offset to the section table and
    the number of sections) and then the section table, which stores one
    ``(start, end)`` pair of big-endian unsigned 32 bit integers per section.

    After construction the following attributes are available:

    ``sec_off``
        Offset of the section table inside the record (always 12).
    ``num_sections``
        Number of sections declared in the record header.
    ``sections``
        Tuple of ``(start, end)`` tuples, one per section.
    '''

    def __init__(self, raw):
        '''
        Parse the bytes of an FDST record.

        :param raw: The complete raw record, as a byte string.
        :raises ValueError: If the signature is wrong, the record is
            truncated, the section table does not start at offset 12 or
            there is data after the section table.
        '''
        if raw[:4] != b'FDST':
            raise ValueError('KF8 does not have a valid FDST record')
        # Signature (4 bytes) + two 32 bit header fields (8 bytes). Checking
        # the length up front reports truncation as ValueError, like every
        # other malformed-record condition, instead of leaking struct.error.
        if len(raw) < 12:
            raise ValueError('FDST record is too short to contain a header')
        self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
        if self.sec_off != 12:
            raise ValueError('FDST record has unknown extra fields')
        # Two 32 bit values (start and end) per section
        secf = b'>%dL' % (self.num_sections*2)
        table_end = self.sec_off + struct.calcsize(secf)
        if len(raw) < table_end:
            raise ValueError('FDST record is too short to contain '
                    '%d sections' % self.num_sections)
        secs = struct.unpack_from(secf, raw, self.sec_off)
        rest = raw[table_end:]
        if rest:
            raise ValueError('FDST record has trailing data: '
                    '%s'%format_bytes(rest))
        # Even positions are start offsets, odd positions the matching ends
        self.sections = tuple(zip(secs[::2], secs[1::2]))

    def __str__(self):
        '''
        Return a multi-line, human readable dump of the record: the header
        fields followed by one line per section.
        '''
        lines = [
            'FDST record',
            '%s: %s' % ('Offset to sections', self.sec_off),
            '%s: %s' % ('Number of section records', self.num_sections),
            '**** %d Sections ****' % len(self.sections),
        ]
        lines.extend('Start: %20d End: %d' % sec for sec in self.sections)
        return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
class MOBIFile(object):
|
class MOBIFile(object):
|
||||||
|
|
||||||
@ -31,7 +62,10 @@ class MOBIFile(object):
|
|||||||
first_text_record+offset+h8.number_of_text_records])]
|
first_text_record+offset+h8.number_of_text_records])]
|
||||||
|
|
||||||
self.raw_text = b''.join(r.raw for r in self.text_records)
|
self.raw_text = b''.join(r.raw for r in self.text_records)
|
||||||
|
self.header = self.mf.mobi8_header
|
||||||
|
self.kf8_records = mf.records[offset:]
|
||||||
self.extract_resources()
|
self.extract_resources()
|
||||||
|
self.read_fdst()
|
||||||
|
|
||||||
def print_header(self, f=sys.stdout):
|
def print_header(self, f=sys.stdout):
|
||||||
print (str(self.mf.palmdb).encode('utf-8'), file=f)
|
print (str(self.mf.palmdb).encode('utf-8'), file=f)
|
||||||
@ -43,6 +77,15 @@ class MOBIFile(object):
|
|||||||
print (file=f)
|
print (file=f)
|
||||||
print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
|
print (str(self.mf.mobi8_header).encode('utf-8'), file=f)
|
||||||
|
|
||||||
|
def read_fdst(self):
    '''
    Decode the FDST record referenced by the KF8 header, if there is one.

    ``self.fdst`` is set to ``None`` when the header's ``fdst_idx`` is
    ``NULL_INDEX``, otherwise to an :class:`FDST` instance built from the
    raw data of the record at that index in ``self.kf8_records``.

    :raises ValueError: If the number of sections found in the FDST record
        differs from the ``fdst_count`` stored in the header.
    '''
    self.fdst = None

    fdst_index = self.header.fdst_idx
    if fdst_index == NULL_INDEX:
        # The header does not reference an FDST record
        return

    record = self.kf8_records[fdst_index]
    self.fdst = FDST(record.raw)
    # The header and the record must agree on the section count
    if self.header.fdst_count != self.fdst.num_sections:
        raise ValueError('KF8 Header contains invalid FDST count')
|
||||||
|
|
||||||
def extract_resources(self):
|
def extract_resources(self):
|
||||||
self.resource_map = []
|
self.resource_map = []
|
||||||
known_types = {b'FLIS', b'FCIS', b'SRCS',
|
known_types = {b'FLIS', b'FCIS', b'SRCS',
|
||||||
@ -96,7 +139,10 @@ def inspect_mobi(mobi_file, ddir):
|
|||||||
rec.dump(os.path.join(ddir, 'text_records'))
|
rec.dump(os.path.join(ddir, 'text_records'))
|
||||||
|
|
||||||
for href, payload in f.resource_map:
|
for href, payload in f.resource_map:
|
||||||
with open(os.path.join(ddir, href), 'wb') as f:
|
with open(os.path.join(ddir, href), 'wb') as fo:
|
||||||
f.write(payload)
|
fo.write(payload)
|
||||||
|
|
||||||
|
if f.fdst:
|
||||||
|
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
|
||||||
|
fo.write(str(f.fdst).encode('utf-8'))
|
||||||
|
|
||||||
|
@ -15,6 +15,12 @@ from calibre.ebooks.mobi.utils import (decint, count_set_bits,
|
|||||||
|
|
||||||
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
|
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
|
||||||
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
|
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
|
||||||
|
# Names of the consecutive 32 bit fields of an INDX record header, in the
# order in which parse_indx_header() unpacks them (starting right after the
# four byte signature). The 27 'unknownN' entries are fields whose meaning
# has not been identified.
INDEX_HEADER_FIELDS = (
    'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
    'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx',
) + tuple('unknown%d' % i for i in range(27)) + (
    'ocnt', 'oentries', 'ordt1', 'ordt2', 'tagx',
)
|
||||||
|
|
||||||
|
|
||||||
class InvalidFile(ValueError):
|
class InvalidFile(ValueError):
|
||||||
pass
|
pass
|
||||||
@ -36,11 +42,7 @@ def format_bytes(byts):
|
|||||||
|
|
||||||
def parse_indx_header(data):
|
def parse_indx_header(data):
|
||||||
check_signature(data, b'INDX')
|
check_signature(data, b'INDX')
|
||||||
words = (
|
words = INDEX_HEADER_FIELDS
|
||||||
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
|
||||||
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
|
||||||
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
|
|
||||||
'ordt1', 'ordt2', 'tagx')
|
|
||||||
num = len(words)
|
num = len(words)
|
||||||
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
|
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
|
||||||
ans = dict(zip(words, values))
|
ans = dict(zip(words, values))
|
||||||
|
@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import struct, re, os, imghdr
|
import struct, re, os, imghdr
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from itertools import repeat
|
from itertools import repeat, izip
|
||||||
from urlparse import urldefrag
|
from urlparse import urldefrag
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@ -71,16 +71,16 @@ class Mobi8Reader(object):
|
|||||||
return self.write_opf(guide, ncx, spine, resource_map)
|
return self.write_opf(guide, ncx, spine, resource_map)
|
||||||
|
|
||||||
def read_indices(self):
|
def read_indices(self):
|
||||||
self.flow_table = (0, NULL_INDEX)
|
self.flow_table = ()
|
||||||
|
|
||||||
if self.header.fdstidx != NULL_INDEX:
|
if self.header.fdstidx != NULL_INDEX:
|
||||||
header = self.kf8_sections[self.header.fdstidx][0]
|
header = self.kf8_sections[self.header.fdstidx][0]
|
||||||
if header[:4] != b'FDST':
|
if header[:4] != b'FDST':
|
||||||
raise ValueError('KF8 does not have a valid FDST record')
|
raise ValueError('KF8 does not have a valid FDST record')
|
||||||
num_sections, = struct.unpack_from(b'>L', header, 0x08)
|
sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
|
||||||
sections = header[0x0c:]
|
secs = struct.unpack_from(b'>%dL' % (num_sections*2),
|
||||||
self.flow_table = struct.unpack_from(b'>%dL' % (num_sections*2),
|
header, sec_start)
|
||||||
sections, 0)[::2] + (NULL_INDEX,)
|
self.flow_table = tuple(izip(secs[::2], secs[1::2]))
|
||||||
|
|
||||||
self.files = []
|
self.files = []
|
||||||
if self.header.skelidx != NULL_INDEX:
|
if self.header.skelidx != NULL_INDEX:
|
||||||
@ -127,13 +127,10 @@ class Mobi8Reader(object):
|
|||||||
raw_ml = self.mobi6_reader.mobi_html
|
raw_ml = self.mobi6_reader.mobi_html
|
||||||
self.flows = []
|
self.flows = []
|
||||||
self.flowinfo = []
|
self.flowinfo = []
|
||||||
|
ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]
|
||||||
|
|
||||||
# now split the raw_ml into its flow pieces
|
# now split the raw_ml into its flow pieces
|
||||||
for j in xrange(0, len(self.flow_table)-1):
|
for start, end in ft:
|
||||||
start = self.flow_table[j]
|
|
||||||
end = self.flow_table[j+1]
|
|
||||||
if end == NULL_INDEX:
|
|
||||||
end = len(raw_ml)
|
|
||||||
self.flows.append(raw_ml[start:end])
|
self.flows.append(raw_ml[start:end])
|
||||||
|
|
||||||
# the first piece represents the xhtml text
|
# the first piece represents the xhtml text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user