diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 7965253be6..06318c4527 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -7,12 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime +import struct, datetime, os from calibre.utils.date import utc_tz from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.utils import get_trailing_data # PalmDB {{{ class PalmDOCAttributes(object): @@ -188,10 +189,13 @@ class EXTHHeader(object): pos = self.read_record(pos) self.records.sort(key=lambda x:x.type) self.rmap = {x.type:x for x in self.records} - self.get = self.rmap.get def __getitem__(self, type_): - return self.rmap.__getitem__(type_) + return self.rmap.__getitem__(type_).data + + def get(self, type_, default=None): + ans = self.rmap.get(type_, default) + return getattr(ans, 'data', default) def read_record(self, pos): type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) @@ -201,7 +205,7 @@ class EXTHHeader(object): @property def kf8_header_index(self): - return self.rmap.get(121, None) + return self.get(121, None) def __str__(self): ans = ['*'*20 + ' EXTH Header '+ '*'*20] @@ -263,9 +267,10 @@ class MOBIHeader(object): # {{{ }.get(self.encoding_raw, repr(self.encoding_raw)) self.uid = self.raw[32:36] self.file_version, = struct.unpack(b'>I', self.raw[36:40]) - self.reserved = self.raw[40:48] + self.meta_orth_indx, self.meta_infl_indx = struct.unpack( + b'>II', self.raw[40:48]) self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52]) - self.reserved2 = self.raw[52:80] + self.reserved = self.raw[52:80] self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84]) self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88]) self.fullname_length, = struct.unpack(b'>I', self.raw[88:92]) @@ -299,9 +304,8 @@ class MOBIHeader(object): # {{{ self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] - self.first_content_record, self.last_content_record = \ - struct.unpack(b'>HH', self.raw[192:196]) - self.unknown5, = struct.unpack(b'>I', self.raw[196:200]) + self.fdst_idx, self.fdst_count = struct.unpack_from(b'>II', + self.raw, 192) (self.fcis_number, self.fcis_count, self.flis_number, self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216]) @@ -320,10 +324,9 @@ class MOBIHeader(object): # {{{ self.raw[244:248]) if self.file_version >= 8: - (self.unknown8, self.skel_idx, self.sect_idx, self.oth_idx, - self.fdst_idx, self.fdst_count) = struct.unpack_from( - b'>LLLLLL', self.raw, 248) - self.unknown9 = self.raw[272:self.length] + (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx + ) = struct.unpack_from(b'>4L', self.raw, 248) + self.unknown9 = self.raw[264:self.length] if self.has_exth: self.exth_offset = 16 + self.length @@ -334,7 +337,7 @@ class MOBIHeader(object): # {{{ self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset] def __str__(self): - ans = ['*'*20 + ' MOBI Header '+ '*'*20] + ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20] a = ans.append i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x)) ans.append('Compression: %s'%self.compression) @@ -349,10 +352,11 @@ class MOBIHeader(object): # {{{ ans.append('Encoding: %s'%self.encoding) ans.append('UID: %r'%self.uid) ans.append('File version: %d'%self.file_version) - ans.append('Reserved: %r'%self.reserved) + ans.append('Meta Orth Index: %d'%self.meta_orth_indx) + ans.append('Meta Infl Index: %d'%self.meta_infl_indx) ans.append('Secondary index record: %d (null val: %d)'%( self.secondary_index_record, NULL_INDEX)) - ans.append('Reserved2: %r'%self.reserved2) + ans.append('Reserved: %r'%self.reserved) ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, self.first_non_book_record)) ans.append('Full name offset: %d'%self.fullname_offset) @@ -377,9 +381,8 @@ class MOBIHeader(object): # {{{ ans.append('DRM Flags: %r'%self.drm_flags) if self.has_extra_data_flags: ans.append('Unknown4: %r'%self.unknown4) - ans.append('First content record: %d'% self.first_content_record) - ans.append('Last content record: %d'% self.last_content_record) - ans.append('Unknown5: %d'% self.unknown5) + ans.append('FDST Index: %d'% self.fdst_idx) + ans.append('FDST Count: %d'% self.fdst_count) ans.append('FCIS number: %d'% self.fcis_number) ans.append('FCIS count: %d'% self.fcis_count) ans.append('FLIS number: %d'% self.flis_number) @@ -398,6 +401,7 @@ class MOBIHeader(object): # {{{ ans.append('Unknown8: %r'%self.unknown8) i('SKEL Index', self.skel_idx) i('Sections Index', self.sect_idx) + i('Unknown8', self.unknown8) i('Other Index', self.oth_idx) i('FDST record', self.fdst_idx) a('FDST Count: %d'%self.fdst_count) @@ -447,28 +451,74 @@ class MOBIFile(object): self.mobi_header = MOBIHeader(self.records[0]) self.huffman_record_nums = [] - if 'huff' in self.mobi_header.compression.lower(): - self.huffman_record_nums = list(xrange(self.mobi_header.huffman_record_offset, - self.mobi_header.huffman_record_offset + - self.mobi_header.huffman_record_count)) - huffrecs = [self.records[r].raw for r in self.huffman_record_nums] - from calibre.ebooks.mobi.huffcdic import HuffReader - huffs = HuffReader(huffrecs) - decompress = huffs.unpack - elif 'palmdoc' in self.mobi_header.compression.lower(): - from calibre.ebooks.compression.palmdoc import decompress_doc - decompress = decompress_doc - else: - decompress = lambda x: x - - self.decompress = decompress - self.kf8_type = None - mh = self.mobi_header + mh = mh8 = self.mobi_header if mh.file_version >= 8: self.kf8_type = 'standalone' elif mh.has_exth and mh.exth.kf8_header_index is not None: self.kf8_type = 'joint' - + kf8i = mh.exth.kf8_header_index + mh8 = MOBIHeader(self.records[kf8i]) + self.mobi8_header = mh8 + + if 'huff' in self.mobi_header.compression.lower(): + from calibre.ebooks.mobi.huffcdic import HuffReader + + def huffit(off, cnt): + huffman_record_nums = list(xrange(off, off+cnt)) + huffrecs = [self.records[r].raw for r in huffman_record_nums] + huffs = HuffReader(huffrecs) + return huffman_record_nums, huffs.unpack + + if self.kf8_type == 'joint': + recs6, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + recs8, d8 = huffit(mh8.huffman_record_offset + kf8i, + mh8.huffman_record_count) + self.huffman_record_nums = recs6 + recs8 + else: + self.huffman_record_nums, d6 = huffit(mh.huffman_record_offset, + mh.huffman_record_count) + d8 = d6 + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + d8 = d6 = decompress_doc + else: + d8 = d6 = lambda x: x + + self.decompress6, self.decompress8 = d6, d8 + +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + raw_trailing_bytes = record.raw[len(self.raw):] + self.raw = decompress(self.raw) + + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + self.trailing_data['raw_bytes'] = raw_trailing_bytes + + for typ, val in self.trailing_data.iteritems(): + if isinstance(typ, int): + print ('Record %d has unknown trailing data of type: %d : %r'% + (idx, typ, val)) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): + raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} diff --git a/src/calibre/ebooks/mobi/debug/main.py b/src/calibre/ebooks/mobi/debug/main.py index 71844150f1..624da65846 100644 --- a/src/calibre/ebooks/mobi/debug/main.py +++ b/src/calibre/ebooks/mobi/debug/main.py @@ -11,6 +11,7 @@ import sys, os, shutil from calibre.ebooks.mobi.debug.headers import MOBIFile from calibre.ebooks.mobi.debug.mobi6 import inspect_mobi as inspect_mobi6 +from calibre.ebooks.mobi.debug.mobi8 import inspect_mobi as inspect_mobi8 def inspect_mobi(path_or_stream, ddir=None): # {{{ stream = (path_or_stream if hasattr(path_or_stream, 'read') else @@ -27,7 +28,15 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ inspect_mobi6(f, ddir) elif f.kf8_type == 'joint': p6 = os.path.join(ddir, 'mobi6') + os.mkdir(p6) inspect_mobi6(f, p6) + p8 = os.path.join(ddir, 'mobi8') + os.mkdir(p8) + inspect_mobi8(f, p8) + else: + inspect_mobi8(f, ddir) + + print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py index 5f0eda4345..640f58c661 100644 --- a/src/calibre/ebooks/mobi/debug/mobi6.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -16,9 +16,10 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import (parse_index_record, parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs, read_font_record) + decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data from calibre.ebooks.mobi.debug import format_bytes +from calibre.ebooks.mobi.debug.headers import TextRecord class TagX(object): # {{{ @@ -472,39 +473,6 @@ class CNCX(object): # {{{ # }}} -class TextRecord(object): # {{{ - - def __init__(self, idx, record, extra_data_flags, decompress): - self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) - raw_trailing_bytes = record.raw[len(self.raw):] - self.raw = decompress(self.raw) - - if 0 in self.trailing_data: - self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) - if 1 in self.trailing_data: - self.trailing_data['indexing'] = self.trailing_data.pop(1) - if 2 in self.trailing_data: - self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) - self.trailing_data['raw_bytes'] = raw_trailing_bytes - - for typ, val in self.trailing_data.iteritems(): - if isinstance(typ, int): - print ('Record %d has unknown trailing data of type: %d : %r'% - (idx, typ, val)) - - self.idx = idx - - def dump(self, folder): - name = '%06d'%self.idx - with open(os.path.join(folder, name+'.txt'), 'wb') as f: - f.write(self.raw) - with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: - for k, v in self.trailing_data.iteritems(): - raw = '%s : %r\n\n'%(k, v) - f.write(raw.encode('utf-8')) - -# }}} - class ImageRecord(object): # {{{ def __init__(self, idx, record, fmt): @@ -781,7 +749,7 @@ class MOBIFile(object): # {{{ if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], - self.mobi_header.extra_data_flags, mf.decompress) for r in xrange(1, + self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] self.font_records = [] @@ -833,12 +801,11 @@ def inspect_mobi(mobi_file, ddir): of.write(rec.raw) alltext += rec.raw of.seek(0) - if f.mobi_header.file_version < 8: - root = html.fromstring(alltext.decode('utf-8')) - with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: - of.write(html.tostring(root, pretty_print=True, encoding='utf-8', - include_meta_content_type=True)) + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) if f.index_header is not None: f.index_record.alltext = alltext @@ -866,7 +833,6 @@ def inspect_mobi(mobi_file, ddir): rec.dump(tdir) - print ('Debug data saved to:', ddir) # }}} diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index eaad81730d..db2b07e53a 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -187,19 +187,13 @@ class BookHeader(object): self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4) if self.mobi_version >= 8: - self.skelidx, = struct.unpack_from('>L', raw, 0xFC) - - # Index into
sections in raw_ml - self.dividx, = struct.unpack_from('>L', raw, 0xF8) - - # Index into Other files - self.othidx, = struct.unpack_from('>L', raw, 0x104) + self.dividx, self.skelidx, self.datpidx, self.othidx = \ + struct.unpack_from(b'>4L', raw, 0xF8) # need to use the FDST record to find out how to properly # unpack the raw_ml into pieces it is simply a table of start # and end locations for each flow piece - self.fdstidx, = struct.unpack_from('>L', raw, 0xC0) - self.fdstcnt, = struct.unpack_from('>L', raw, 0xC4) + self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0) # if cnt is 1 or less, fdst section number can be garbage if self.fdstcnt <= 1: self.fdstidx = NULL_INDEX