mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Mobi debug: Dump text/image and unparsed binary records
This commit is contained in:
parent
c7ea8f4886
commit
c0bb5902ac
@ -7,11 +7,13 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import struct, datetime, sys, os
|
import struct, datetime, sys, os, shutil
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from calibre.utils.date import utc_tz
|
from calibre.utils.date import utc_tz
|
||||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
||||||
from calibre.ebooks.mobi.writer2.utils import decode_hex_number, decint
|
from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint,
|
||||||
|
get_trailing_data)
|
||||||
|
from calibre.utils.magick.draw import identify_data
|
||||||
|
|
||||||
# PalmDB {{{
|
# PalmDB {{{
|
||||||
class PalmDOCAttributes(object):
|
class PalmDOCAttributes(object):
|
||||||
@ -278,6 +280,7 @@ class MOBIHeader(object): # {{{
|
|||||||
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
|
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
|
||||||
self.has_fcis_flis = False
|
self.has_fcis_flis = False
|
||||||
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
|
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
|
||||||
|
self.extra_data_flags = 0
|
||||||
if self.has_extra_data_flags:
|
if self.has_extra_data_flags:
|
||||||
self.unknown4 = self.raw[180:192]
|
self.unknown4 = self.raw[180:192]
|
||||||
self.first_content_record, self.last_content_record = \
|
self.first_content_record, self.last_content_record = \
|
||||||
@ -726,6 +729,63 @@ class CNCX(object) : # {{{
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
class TextRecord(object): # {{{
|
||||||
|
|
||||||
|
def __init__(self, idx, record, extra_data_flags, decompress):
|
||||||
|
self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
|
||||||
|
self.raw = decompress(self.raw)
|
||||||
|
if 0 in self.trailing_data:
|
||||||
|
self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
|
||||||
|
if 1 in self.trailing_data:
|
||||||
|
self.trailing_data['indexing'] = self.trailing_data.pop(1)
|
||||||
|
if 2 in self.trailing_data:
|
||||||
|
self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2)
|
||||||
|
|
||||||
|
self.idx = idx
|
||||||
|
|
||||||
|
def dump(self, folder):
|
||||||
|
name = '%06d'%self.idx
|
||||||
|
with open(os.path.join(folder, name+'.txt'), 'wb') as f:
|
||||||
|
f.write(self.raw)
|
||||||
|
with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f:
|
||||||
|
for k, v in self.trailing_data.iteritems():
|
||||||
|
raw = '%s : %r\n\n'%(k, v)
|
||||||
|
f.write(raw.encode('utf-8'))
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
class ImageRecord(object): # {{{
|
||||||
|
|
||||||
|
def __init__(self, idx, record, fmt):
|
||||||
|
self.raw = record.raw
|
||||||
|
self.fmt = fmt
|
||||||
|
self.idx = idx
|
||||||
|
|
||||||
|
def dump(self, folder):
|
||||||
|
name = '%06d'%self.idx
|
||||||
|
with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f:
|
||||||
|
f.write(self.raw)
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
class BinaryRecord(object): # {{{
|
||||||
|
|
||||||
|
def __init__(self, idx, record):
|
||||||
|
self.raw = record.raw
|
||||||
|
sig = self.raw[:4]
|
||||||
|
name = '%06d'%idx
|
||||||
|
if sig in (b'FCIS', b'FLIS', b'SRCS'):
|
||||||
|
name += '-' + sig.decode('ascii')
|
||||||
|
elif sig == b'\xe9\x8e\r\n':
|
||||||
|
name += '-' + 'EOF'
|
||||||
|
self.name = name
|
||||||
|
|
||||||
|
def dump(self, folder):
|
||||||
|
with open(os.path.join(folder, self.name+'.bin'), 'wb') as f:
|
||||||
|
f.write(self.raw)
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
class MOBIFile(object): # {{{
|
class MOBIFile(object): # {{{
|
||||||
|
|
||||||
def __init__(self, stream):
|
def __init__(self, stream):
|
||||||
@ -754,7 +814,22 @@ class MOBIFile(object): # {{{
|
|||||||
|
|
||||||
self.mobi_header = MOBIHeader(self.records[0])
|
self.mobi_header = MOBIHeader(self.records[0])
|
||||||
|
|
||||||
|
if 'huff' in self.mobi_header.compression.lower():
|
||||||
|
huffrecs = [r.raw for r in
|
||||||
|
xrange(self.mobi_header.huffman_record_offset,
|
||||||
|
self.mobi_header.huffman_record_offset +
|
||||||
|
self.mobi_header.huffman_record_count)]
|
||||||
|
from calibre.ebooks.mobi.huffcdic import HuffReader
|
||||||
|
huffs = HuffReader(huffrecs)
|
||||||
|
decompress = huffs.decompress
|
||||||
|
elif 'palmdoc' in self.mobi_header.compression.lower():
|
||||||
|
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||||
|
decompress = decompress_doc
|
||||||
|
else:
|
||||||
|
decompress = lambda x: x
|
||||||
|
|
||||||
self.index_header = None
|
self.index_header = None
|
||||||
|
self.indexing_record_nums = set()
|
||||||
pir = self.mobi_header.primary_index_record
|
pir = self.mobi_header.primary_index_record
|
||||||
if pir != 0xffffffff:
|
if pir != 0xffffffff:
|
||||||
self.index_header = IndexHeader(self.records[pir])
|
self.index_header = IndexHeader(self.records[pir])
|
||||||
@ -763,6 +838,34 @@ class MOBIFile(object): # {{{
|
|||||||
self.index_header.index_encoding)
|
self.index_header.index_encoding)
|
||||||
self.index_record = IndexRecord(self.records[pir+1],
|
self.index_record = IndexRecord(self.records[pir+1],
|
||||||
self.index_header, self.cncx)
|
self.index_header, self.cncx)
|
||||||
|
self.indexing_record_nums = set(xrange(pir,
|
||||||
|
pir+2+self.index_header.num_of_cncx_blocks))
|
||||||
|
|
||||||
|
|
||||||
|
ntr = self.mobi_header.number_of_text_records
|
||||||
|
fntbr = self.mobi_header.first_non_book_record
|
||||||
|
fii = self.mobi_header.first_image_index
|
||||||
|
if fntbr == 0xffffffff:
|
||||||
|
fntbr = len(self.records)
|
||||||
|
self.text_records = [TextRecord(r, self.records[r],
|
||||||
|
self.mobi_header.extra_data_flags, decompress) for r in xrange(1,
|
||||||
|
min(len(self.records), ntr+1))]
|
||||||
|
self.image_records, self.binary_records = [], []
|
||||||
|
for i in xrange(fntbr, len(self.records)):
|
||||||
|
if i in self.indexing_record_nums:
|
||||||
|
continue
|
||||||
|
r = self.records[i]
|
||||||
|
fmt = None
|
||||||
|
if i >= fii and r.raw[:4] not in (b'FLIS', b'FCIS', b'SRCS',
|
||||||
|
b'\xe9\x8e\r\n'):
|
||||||
|
try:
|
||||||
|
width, height, fmt = identify_data(r.raw)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
if fmt is not None:
|
||||||
|
self.image_records.append(ImageRecord(i, r, fmt))
|
||||||
|
else:
|
||||||
|
self.binary_records.append(BinaryRecord(i, r))
|
||||||
|
|
||||||
|
|
||||||
def print_header(self, f=sys.stdout):
|
def print_header(self, f=sys.stdout):
|
||||||
@ -776,13 +879,16 @@ class MOBIFile(object): # {{{
|
|||||||
print (str(self.mobi_header).encode('utf-8'), file=f)
|
print (str(self.mobi_header).encode('utf-8'), file=f)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def inspect_mobi(path_or_stream):
|
def inspect_mobi(path_or_stream, prefix='decompiled'):
|
||||||
stream = (path_or_stream if hasattr(path_or_stream, 'read') else
|
stream = (path_or_stream if hasattr(path_or_stream, 'read') else
|
||||||
open(path_or_stream, 'rb'))
|
open(path_or_stream, 'rb'))
|
||||||
f = MOBIFile(stream)
|
f = MOBIFile(stream)
|
||||||
ddir = 'debug_' + os.path.splitext(os.path.basename(stream.name))[0]
|
ddir = prefix + '_' + os.path.splitext(os.path.basename(stream.name))[0]
|
||||||
if not os.path.exists(ddir):
|
try:
|
||||||
os.mkdir(ddir)
|
shutil.rmtree(ddir)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
os.mkdir(ddir)
|
||||||
with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
|
with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
|
||||||
f.print_header(f=out)
|
f.print_header(f=out)
|
||||||
if f.index_header is not None:
|
if f.index_header is not None:
|
||||||
@ -793,6 +899,13 @@ def inspect_mobi(path_or_stream):
|
|||||||
print('\n\n', file=out)
|
print('\n\n', file=out)
|
||||||
print(str(f.index_record), file=out)
|
print(str(f.index_record), file=out)
|
||||||
|
|
||||||
|
for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
|
||||||
|
('binary', 'binary_records')]:
|
||||||
|
tdir = os.path.join(ddir, tdir)
|
||||||
|
os.mkdir(tdir)
|
||||||
|
for rec in getattr(f, attr):
|
||||||
|
rec.dump(tdir)
|
||||||
|
|
||||||
print ('Debug data saved to:', ddir)
|
print ('Debug data saved to:', ddir)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import struct
|
import struct
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
|
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
|
||||||
|
|
||||||
@ -150,4 +151,26 @@ def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
|
|||||||
scale -= 0.05
|
scale -= 0.05
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def get_trailing_data(record, extra_data_flags):
|
||||||
|
'''
|
||||||
|
Given a text record as a bytestring and the extra data flags from the MOBI
|
||||||
|
header, return the trailing data as a dictionary, mapping bit number to
|
||||||
|
data as bytestring. Also returns the record - all trailing data.
|
||||||
|
|
||||||
|
:return: Trailing data, record - trailing data
|
||||||
|
'''
|
||||||
|
data = OrderedDict()
|
||||||
|
for i in xrange(16, -1, -1):
|
||||||
|
flag = 2**i
|
||||||
|
if flag & extra_data_flags:
|
||||||
|
if i == 0:
|
||||||
|
# Only the first two bits are used for the size since there can
|
||||||
|
# never be more than 3 trailing multibyte chars
|
||||||
|
sz = ord(record[-1]) & 0b11
|
||||||
|
consumed = 1
|
||||||
|
else:
|
||||||
|
sz, consumed = decint(record, forward=False)
|
||||||
|
data[i] = record[-(sz+consumed):-consumed]
|
||||||
|
record = record[:-(sz+consumed)]
|
||||||
|
return data, record
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user