Some progress on KF8 support in inspect MOBI

This commit is contained in:
Kovid Goyal 2012-03-18 12:15:37 +05:30
parent b6d02adfe3
commit 06f3a18684
4 changed files with 106 additions and 87 deletions

View File

@ -7,12 +7,13 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct, datetime import struct, datetime, os
from calibre.utils.date import utc_tz from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.utils import get_trailing_data
# PalmDB {{{ # PalmDB {{{
class PalmDOCAttributes(object): class PalmDOCAttributes(object):
@ -188,10 +189,13 @@ class EXTHHeader(object):
pos = self.read_record(pos) pos = self.read_record(pos)
self.records.sort(key=lambda x:x.type) self.records.sort(key=lambda x:x.type)
self.rmap = {x.type:x for x in self.records} self.rmap = {x.type:x for x in self.records}
self.get = self.rmap.get
def __getitem__(self, type_): def __getitem__(self, type_):
return self.rmap.__getitem__(type_) return self.rmap.__getitem__(type_).data
def get(self, type_, default=None):
ans = self.rmap.get(type_, default)
return getattr(ans, 'data', default)
def read_record(self, pos): def read_record(self, pos):
type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
@ -201,7 +205,7 @@ class EXTHHeader(object):
@property @property
def kf8_header_index(self): def kf8_header_index(self):
return self.rmap.get(121, None) return self.get(121, None)
def __str__(self): def __str__(self):
ans = ['*'*20 + ' EXTH Header '+ '*'*20] ans = ['*'*20 + ' EXTH Header '+ '*'*20]
@ -263,9 +267,10 @@ class MOBIHeader(object): # {{{
}.get(self.encoding_raw, repr(self.encoding_raw)) }.get(self.encoding_raw, repr(self.encoding_raw))
self.uid = self.raw[32:36] self.uid = self.raw[32:36]
self.file_version, = struct.unpack(b'>I', self.raw[36:40]) self.file_version, = struct.unpack(b'>I', self.raw[36:40])
self.reserved = self.raw[40:48] self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
b'>II', self.raw[40:48])
self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
self.reserved2 = self.raw[52:80] self.reserved = self.raw[52:80]
self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
@ -299,9 +304,8 @@ class MOBIHeader(object): # {{{
self.extra_data_flags = 0 self.extra_data_flags = 0
if self.has_extra_data_flags: if self.has_extra_data_flags:
self.unknown4 = self.raw[180:192] self.unknown4 = self.raw[180:192]
self.first_content_record, self.last_content_record = \ self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II',
struct.unpack(b'>HH', self.raw[192:196]) self.raw, 192)
self.unknown5, = struct.unpack(b'>I', self.raw[196:200])
(self.fcis_number, self.fcis_count, self.flis_number, (self.fcis_number, self.fcis_count, self.flis_number,
self.flis_count) = struct.unpack(b'>IIII', self.flis_count) = struct.unpack(b'>IIII',
self.raw[200:216]) self.raw[200:216])
@ -320,10 +324,9 @@ class MOBIHeader(object): # {{{
self.raw[244:248]) self.raw[244:248])
if self.file_version >= 8: if self.file_version >= 8:
(self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
self.fdst_idx, self.fdst_count) = struct.unpack_from( ) = struct.unpack_from(b'>4L', self.raw, 248)
b'>LLLLLL', self.raw, 248) self.unknown9 = self.raw[264:self.length]
self.unknown9 = self.raw[272:self.length]
if self.has_exth: if self.has_exth:
self.exth_offset = 16 + self.length self.exth_offset = 16 + self.length
@ -334,7 +337,7 @@ class MOBIHeader(object): # {{{
self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]
def __str__(self): def __str__(self):
ans = ['*'*20 + ' MOBI Header '+ '*'*20] ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
a = ans.append a = ans.append
i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
ans.append('Compression: %s'%self.compression) ans.append('Compression: %s'%self.compression)
@ -349,10 +352,11 @@ class MOBIHeader(object): # {{{
ans.append('Encoding: %s'%self.encoding) ans.append('Encoding: %s'%self.encoding)
ans.append('UID: %r'%self.uid) ans.append('UID: %r'%self.uid)
ans.append('File version: %d'%self.file_version) ans.append('File version: %d'%self.file_version)
ans.append('Reserved: %r'%self.reserved) ans.append('Meta Orth Index: %d'%self.meta_orth_indx)
ans.append('Meta Infl Index: %d'%self.meta_infl_indx)
ans.append('Secondary index record: %d (null val: %d)'%( ans.append('Secondary index record: %d (null val: %d)'%(
self.secondary_index_record, NULL_INDEX)) self.secondary_index_record, NULL_INDEX))
ans.append('Reserved2: %r'%self.reserved2) ans.append('Reserved: %r'%self.reserved)
ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
self.first_non_book_record)) self.first_non_book_record))
ans.append('Full name offset: %d'%self.fullname_offset) ans.append('Full name offset: %d'%self.fullname_offset)
@ -377,9 +381,8 @@ class MOBIHeader(object): # {{{
ans.append('DRM Flags: %r'%self.drm_flags) ans.append('DRM Flags: %r'%self.drm_flags)
if self.has_extra_data_flags: if self.has_extra_data_flags:
ans.append('Unknown4: %r'%self.unknown4) ans.append('Unknown4: %r'%self.unknown4)
ans.append('First content record: %d'% self.first_content_record) ans.append('FDST Index: %d'% self.fdst_idx)
ans.append('Last content record: %d'% self.last_content_record) ans.append('FDST Count: %d'% self.fdst_count)
ans.append('Unknown5: %d'% self.unknown5)
ans.append('FCIS number: %d'% self.fcis_number) ans.append('FCIS number: %d'% self.fcis_number)
ans.append('FCIS count: %d'% self.fcis_count) ans.append('FCIS count: %d'% self.fcis_count)
ans.append('FLIS number: %d'% self.flis_number) ans.append('FLIS number: %d'% self.flis_number)
@ -398,6 +401,7 @@ class MOBIHeader(object): # {{{
ans.append('Unknown8: %r'%self.unknown8) ans.append('Unknown8: %r'%self.unknown8)
i('SKEL Index', self.skel_idx) i('SKEL Index', self.skel_idx)
i('Sections Index', self.sect_idx) i('Sections Index', self.sect_idx)
i('Unknown8', self.unknown8)
i('Other Index', self.oth_idx) i('Other Index', self.oth_idx)
i('FDST record', self.fdst_idx) i('FDST record', self.fdst_idx)
a('FDST Count: %d'%self.fdst_count) a('FDST Count: %d'%self.fdst_count)
@ -447,28 +451,74 @@ class MOBIFile(object):
self.mobi_header = MOBIHeader(self.records[0]) self.mobi_header = MOBIHeader(self.records[0])
self.huffman_record_nums = [] self.huffman_record_nums = []
if 'huff' in self.mobi_header.compression.lower():
self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset,
self.mobi_header.huffman_record_offset +
self.mobi_header.huffman_record_count))
huffrecs = [self.records[r].raw for r in self.huffman_record_nums]
from calibre.ebooks.mobi.huffcdic import HuffReader
huffs = HuffReader(huffrecs)
decompress = huffs.unpack
elif 'palmdoc' in self.mobi_header.compression.lower():
from calibre.ebooks.compression.palmdoc import decompress_doc
decompress = decompress_doc
else:
decompress = lambda x: x
self.decompress = decompress
self.kf8_type = None self.kf8_type = None
mh = self.mobi_header mh = mh8 = self.mobi_header
if mh.file_version >= 8: if mh.file_version >= 8:
self.kf8_type = 'standalone' self.kf8_type = 'standalone'
elif mh.has_exth and mh.exth.kf8_header_index is not None: elif mh.has_exth and mh.exth.kf8_header_index is not None:
self.kf8_type = 'joint' self.kf8_type = 'joint'
kf8i = mh.exth.kf8_header_index
mh8 = MOBIHeader(self.records[kf8i])
self.mobi8_header = mh8
if 'huff' in self.mobi_header.compression.lower():
from calibre.ebooks.mobi.huffcdic import HuffReader
def huffit(off, cnt):
huffman_record_nums = list(xrange(off, off+cnt))
huffrecs = [self.records[r].raw for r in huffman_record_nums]
huffs = HuffReader(huffrecs)
return huffman_record_nums, huffs.unpack
if self.kf8_type == 'joint':
recs6, d6 = huffit(mh.huffman_record_offset,
mh.huffman_record_count)
recs8, d8 = huffit(mh8.huffman_record_offset + kf8i,
mh8.huffman_record_count)
self.huffman_record_nums = recs6 + recs8
else:
self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset,
mh.huffman_record_count)
d8 = d6
elif 'palmdoc' in self.mobi_header.compression.lower():
from calibre.ebooks.compression.palmdoc import decompress_doc
d8 = d6 = decompress_doc
else:
d8 = d6 = lambda x: x
self.decompress6, self.decompress8 = d6, d8
class TextRecord(object): # {{{
    '''
    One text record of the book: the decompressed payload plus the
    trailing data that get_trailing_data() split off the end of it.
    '''

    def __init__(self, idx, record, extra_data_flags, decompress):
        '''
        :param idx: Number of this record, used in messages and file names
        :param record: Object whose ``raw`` attribute holds the record bytes
        :param extra_data_flags: Passed through to get_trailing_data()
        :param decompress: Callable applied to the payload bytes
        '''
        trailing, payload = get_trailing_data(record.raw, extra_data_flags)
        # Everything in the record after the payload, kept verbatim
        tail = record.raw[len(payload):]
        self.trailing_data = trailing
        self.raw = decompress(payload)

        # Replace the known numeric type codes with descriptive names
        known_types = (
            (0, 'multibyte_overlap'),
            (1, 'indexing'),
            (2, 'uncrossable_breaks'),
        )
        for code, label in known_types:
            if code in trailing:
                trailing[label] = trailing.pop(code)
        trailing['raw_bytes'] = tail

        # Any key that is still an integer is a type that was not renamed
        for typ, val in trailing.iteritems():
            if isinstance(typ, int):
                print ('Record %d has unknown trailing data of type: %d : %r'%
                        (idx, typ, val))

        self.idx = idx

    def dump(self, folder):
        '''
        Write the record text to ``<idx>.txt`` and the trailing data to
        ``<idx>.trailing_data`` inside folder, with idx zero padded to six
        digits.
        '''
        stem = os.path.join(folder, '%06d'%self.idx)
        with open(stem+'.txt', 'wb') as out:
            out.write(self.raw)
        with open(stem+'.trailing_data', 'wb') as out:
            for key, value in self.trailing_data.iteritems():
                entry = '%s : %r\n\n'%(key, value)
                out.write(entry.encode('utf-8'))

# }}}

View File

@ -11,6 +11,7 @@ import sys, os, shutil
from calibre.ebooks.mobi.debug.headers import MOBIFile from calibre.ebooks.mobi.debug.headers import MOBIFile
from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6
from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8
def inspect_mobi(path_or_stream, ddir=None): # {{{ def inspect_mobi(path_or_stream, ddir=None): # {{{
stream = (path_or_stream if hasattr(path_or_stream, 'read') else stream = (path_or_stream if hasattr(path_or_stream, 'read') else
@ -27,7 +28,15 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{
inspect_mobi6(f, ddir) inspect_mobi6(f, ddir)
elif f.kf8_type == 'joint': elif f.kf8_type == 'joint':
p6 = os.path.join(ddir, 'mobi6') p6 = os.path.join(ddir, 'mobi6')
os.mkdir(p6)
inspect_mobi6(f, p6) inspect_mobi6(f, p6)
p8 = os.path.join(ddir, 'mobi8')
os.mkdir(p8)
inspect_mobi8(f, p8)
else:
inspect_mobi8(f, ddir)
print ('Debug data saved to:', ddir)
# }}} # }}}

View File

@ -16,9 +16,10 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (parse_index_record, from calibre.ebooks.mobi.reader.index import (parse_index_record,
parse_tagx_section) parse_tagx_section)
from calibre.ebooks.mobi.utils import (decode_hex_number, decint, from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
get_trailing_data, decode_tbs, read_font_record) decode_tbs, read_font_record)
from calibre.utils.magick.draw import identify_data from calibre.utils.magick.draw import identify_data
from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.debug.headers import TextRecord
class TagX(object): # {{{ class TagX(object): # {{{
@ -472,39 +473,6 @@ class CNCX(object): # {{{
# }}} # }}}
class TextRecord(object): # {{{
    '''
    One text record of the book: the decompressed payload plus the
    trailing data that get_trailing_data() split off the end of it.
    '''

    def __init__(self, idx, record, extra_data_flags, decompress):
        '''
        :param idx: Number of this record, used in messages and file names
        :param record: Object whose ``raw`` attribute holds the record bytes
        :param extra_data_flags: Passed through to get_trailing_data()
        :param decompress: Callable applied to the payload bytes
        '''
        trailing, payload = get_trailing_data(record.raw, extra_data_flags)
        # Everything in the record after the payload, kept verbatim
        tail = record.raw[len(payload):]
        self.trailing_data = trailing
        self.raw = decompress(payload)

        # Replace the known numeric type codes with descriptive names
        known_types = (
            (0, 'multibyte_overlap'),
            (1, 'indexing'),
            (2, 'uncrossable_breaks'),
        )
        for code, label in known_types:
            if code in trailing:
                trailing[label] = trailing.pop(code)
        trailing['raw_bytes'] = tail

        # Any key that is still an integer is a type that was not renamed
        for typ, val in trailing.iteritems():
            if isinstance(typ, int):
                print ('Record %d has unknown trailing data of type: %d : %r'%
                        (idx, typ, val))

        self.idx = idx

    def dump(self, folder):
        '''
        Write the record text to ``<idx>.txt`` and the trailing data to
        ``<idx>.trailing_data`` inside folder, with idx zero padded to six
        digits.
        '''
        stem = os.path.join(folder, '%06d'%self.idx)
        with open(stem+'.txt', 'wb') as out:
            out.write(self.raw)
        with open(stem+'.trailing_data', 'wb') as out:
            for key, value in self.trailing_data.iteritems():
                entry = '%s : %r\n\n'%(key, value)
                out.write(entry.encode('utf-8'))

# }}}
class ImageRecord(object): # {{{ class ImageRecord(object): # {{{
def __init__(self, idx, record, fmt): def __init__(self, idx, record, fmt):
@ -781,7 +749,7 @@ class MOBIFile(object): # {{{
if fntbr == NULL_INDEX: if fntbr == NULL_INDEX:
fntbr = len(self.records) fntbr = len(self.records)
self.text_records = [TextRecord(r, self.records[r], self.text_records = [TextRecord(r, self.records[r],
self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1,
min(len(self.records), ntr+1))] min(len(self.records), ntr+1))]
self.image_records, self.binary_records = [], [] self.image_records, self.binary_records = [], []
self.font_records = [] self.font_records = []
@ -833,13 +801,12 @@ def inspect_mobi(mobi_file, ddir):
of.write(rec.raw) of.write(rec.raw)
alltext += rec.raw alltext += rec.raw
of.seek(0) of.seek(0)
if f.mobi_header.file_version < 8:
root = html.fromstring(alltext.decode('utf-8')) root = html.fromstring(alltext.decode('utf-8'))
with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
of.write(html.tostring(root, pretty_print=True, encoding='utf-8', of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
include_meta_content_type=True)) include_meta_content_type=True))
if f.index_header is not None: if f.index_header is not None:
f.index_record.alltext = alltext f.index_record.alltext = alltext
with open(os.path.join(ddir, 'index.txt'), 'wb') as out: with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
@ -866,7 +833,6 @@ def inspect_mobi(mobi_file, ddir):
rec.dump(tdir) rec.dump(tdir)
print ('Debug data saved to:', ddir)
# }}} # }}}

View File

@ -187,19 +187,13 @@ class BookHeader(object):
self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)
if self.mobi_version >= 8: if self.mobi_version >= 8:
self.skelidx, = struct.unpack_from('>L', raw, 0xFC) self.dividx, self.skelidx, self.datpidx, self.othidx = \
struct.unpack_from(b'>4L', raw, 0xF8)
# Index into <div> sections in raw_ml
self.dividx, = struct.unpack_from('>L', raw, 0xF8)
# Index into Other files
self.othidx, = struct.unpack_from('>L', raw, 0x104)
# need to use the FDST record to find out how to properly # need to use the FDST record to find out how to properly
# unpack the raw_ml into pieces; it is simply a table of start # unpack the raw_ml into pieces; it is simply a table of start
# and end locations for each flow piece # and end locations for each flow piece
self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4)
# if cnt is 1 or less, fdst section number can be garbage # if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1: if self.fdstcnt <= 1:
self.fdstidx = NULL_INDEX self.fdstidx = NULL_INDEX