From c0bb5902ac5d712d9549438ff96ad1e7a1e25f51 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jul 2011 09:50:33 -0600 Subject: [PATCH] Mobi debug: Dump text/image and unparsed binary records --- src/calibre/ebooks/mobi/debug.py | 125 +++++++++++++++++++++-- src/calibre/ebooks/mobi/writer2/utils.py | 23 +++++ 2 files changed, 142 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 8ffa3aa15b..2dbe363e7c 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -7,11 +7,13 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os +import struct, datetime, sys, os, shutil from collections import OrderedDict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language -from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint +from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint, + get_trailing_data) +from calibre.utils.magick.draw import identify_data # PalmDB {{{ class PalmDOCAttributes(object): @@ -278,6 +280,7 @@ class MOBIHeader(object): # {{{ self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16 self.has_fcis_flis = False self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False + self.extra_data_flags = 0 if self.has_extra_data_flags: self.unknown4 = self.raw[180:192] self.first_content_record, self.last_content_record = \ @@ -726,6 +729,63 @@ class CNCX(object) : # {{{ # }}} +class TextRecord(object): # {{{ + + def __init__(self, idx, record, extra_data_flags, decompress): + self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags) + self.raw = decompress(self.raw) + if 0 in self.trailing_data: + self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0) + if 1 in self.trailing_data: + self.trailing_data['indexing'] = self.trailing_data.pop(1) + if 2 in self.trailing_data: + self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2) + + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.txt'), 'wb') as f: + f.write(self.raw) + with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f: + for k, v in self.trailing_data.iteritems(): + raw = '%s : %r\n\n'%(k, v) + f.write(raw.encode('utf-8')) + +# }}} + +class ImageRecord(object): # {{{ + + def __init__(self, idx, record, fmt): + self.raw = record.raw + self.fmt = fmt + self.idx = idx + + def dump(self, folder): + name = '%06d'%self.idx + with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f: + f.write(self.raw) + +# }}} + +class BinaryRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + sig = self.raw[:4] + name = '%06d'%idx + if sig in (b'FCIS', b'FLIS', b'SRCS'): + name += '-' + sig.decode('ascii') + elif sig == b'\xe9\x8e\r\n': + name += '-' + 'EOF' + self.name = name + + def dump(self, folder): + with open(os.path.join(folder, self.name+'.bin'), 'wb') as f: + f.write(self.raw) + +# }}} + class MOBIFile(object): # {{{ def __init__(self, stream): @@ -754,7 +814,22 @@ class MOBIFile(object): # {{{ self.mobi_header = MOBIHeader(self.records[0]) + if 'huff' in self.mobi_header.compression.lower(): + huffrecs = [r.raw for r in + xrange(self.mobi_header.huffman_record_offset, + self.mobi_header.huffman_record_offset + + self.mobi_header.huffman_record_count)] + from calibre.ebooks.mobi.huffcdic import HuffReader + huffs = HuffReader(huffrecs) + decompress = huffs.decompress + elif 'palmdoc' in self.mobi_header.compression.lower(): + from calibre.ebooks.compression.palmdoc import decompress_doc + decompress = decompress_doc + else: + decompress = lambda x: x + self.index_header = None + self.indexing_record_nums = set() pir = self.mobi_header.primary_index_record if pir != 0xffffffff: self.index_header = IndexHeader(self.records[pir]) @@ -763,6 +838,34 @@ class MOBIFile(object): # {{{ self.index_header.index_encoding) self.index_record = IndexRecord(self.records[pir+1], self.index_header, self.cncx) + self.indexing_record_nums = set(xrange(pir, + pir+2+self.index_header.num_of_cncx_blocks)) + + + ntr = self.mobi_header.number_of_text_records + fntbr = self.mobi_header.first_non_book_record + fii = self.mobi_header.first_image_index + if fntbr == 0xffffffff: + fntbr = len(self.records) + self.text_records = [TextRecord(r, self.records[r], + self.mobi_header.extra_data_flags, decompress) for r in xrange(1, + min(len(self.records), ntr+1))] + self.image_records, self.binary_records = [], [] + for i in xrange(fntbr, len(self.records)): + if i in self.indexing_record_nums: + continue + r = self.records[i] + fmt = None + if i >= fii and r.raw[:4] not in (b'FLIS', b'FCIS', b'SRCS', + b'\xe9\x8e\r\n'): + try: + width, height, fmt = identify_data(r.raw) + except: + pass + if fmt is not None: + self.image_records.append(ImageRecord(i, r, fmt)) + else: + self.binary_records.append(BinaryRecord(i, r)) def print_header(self, f=sys.stdout): @@ -776,13 +879,16 @@ class MOBIFile(object): # {{{ print (str(self.mobi_header).encode('utf-8'), file=f) # }}} -def inspect_mobi(path_or_stream): +def inspect_mobi(path_or_stream, prefix='decompiled'): stream = (path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')) f = MOBIFile(stream) - ddir = 'debug_' + os.path.splitext(os.path.basename(stream.name))[0] - if not os.path.exists(ddir): - os.mkdir(ddir) + ddir = prefix + '_' + os.path.splitext(os.path.basename(stream.name))[0] + try: + shutil.rmtree(ddir) + except: + pass + os.mkdir(ddir) with open(os.path.join(ddir, 'header.txt'), 'wb') as out: f.print_header(f=out) if f.index_header is not None: @@ -793,6 +899,13 @@ def inspect_mobi(path_or_stream): print('\n\n', file=out) print(str(f.index_record), file=out) + for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), + ('binary', 'binary_records')]: + tdir = os.path.join(ddir, tdir) + os.mkdir(tdir) + for rec in getattr(f, attr): + rec.dump(tdir) + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/utils.py b/src/calibre/ebooks/mobi/writer2/utils.py index 8166bdf328..708b9152d4 100644 --- a/src/calibre/ebooks/mobi/writer2/utils.py +++ b/src/calibre/ebooks/mobi/writer2/utils.py @@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct +from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail @@ -150,4 +151,26 @@ def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): scale -= 0.05 return data +def get_trailing_data(record, extra_data_flags): + ''' + Given a text record as a bytestring and the extra data flags from the MOBI + header, return the trailing data as a dictionary, mapping bit number to + data as bytestring. Also returns the record - all trailing data. + + :return: Trailing data, record - trailing data + ''' + data = OrderedDict() + for i in xrange(16, -1, -1): + flag = 2**i + if flag & extra_data_flags: + if i == 0: + # Only the first two bits are used for the size since there can + # never be more than 3 trailing multibyte chars + sz = ord(record[-1]) & 0b11 + consumed = 1 + else: + sz, consumed = decint(record, forward=False) + data[i] = record[-(sz+consumed):-consumed] + record = record[:-(sz+consumed)] + return data, record