When dumping joint MOBI files, properly extract all image/font resources and only put resources up to the KF8 boundary in the mobi6 part

This commit is contained in:
Kovid Goyal 2014-07-16 12:48:06 +05:30
parent 843624f6c2
commit f0a890e8c1
3 changed files with 31 additions and 17 deletions

View File

@ -383,6 +383,13 @@ class MOBIHeader(object): # {{{
if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
setattr(self, x, self.header_offset+getattr(self, x))
# Try to find the first non-text record
self.first_resource_record = offset + 1 + self.number_of_text_records # Default to first record after all text records
pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX), getattr(self, 'first_image_index', NULL_INDEX))
if pointer != NULL_INDEX:
self.first_resource_record = max(pointer, self.first_resource_record)
self.last_resource_record = NULL_INDEX
if self.has_exth:
self.exth_offset = 16 + self.length
@ -391,6 +398,10 @@ class MOBIHeader(object): # {{{
self.end_of_exth = self.exth_offset + self.exth.length
self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]
if self.exth.kf8_header_index is not None and offset == 0:
# MOBI 6 header in a joint file, adjust self.last_resource_record
self.last_resource_record = self.exth.kf8_header_index - 2
def __str__(self):
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]

View File

@ -41,7 +41,7 @@ class SecondaryIndexHeader(object): # {{{
def __init__(self, record):
self.record = record
raw = self.record.raw
#open('/t/index_header.bin', 'wb').write(raw)
# open('/t/index_header.bin', 'wb').write(raw)
if raw[:4] != b'INDX':
raise ValueError('Invalid Secondary Index Record')
self.header_length, = struct.unpack('>I', raw[4:8])
@ -136,7 +136,7 @@ class IndexHeader(object): # {{{
def __init__(self, record):
self.record = record
raw = self.record.raw
#open('/t/index_header.bin', 'wb').write(raw)
# open('/t/index_header.bin', 'wb').write(raw)
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
@ -492,7 +492,7 @@ class BinaryRecord(object): # {{{
sig = self.raw[:4]
name = '%06d'%idx
if sig in {b'FCIS', b'FLIS', b'SRCS', b'DATP', b'RESC', b'BOUN',
b'FDST', b'AUDI', b'VIDE',}:
b'FDST', b'AUDI', b'VIDE', b'CRES', b'CONT', b'CMET'}:
name += '-' + sig.decode('ascii')
elif sig == b'\xe9\x8e\r\n':
name += '-' + 'EOF'
@ -743,17 +743,14 @@ class MOBIFile(object): # {{{
self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))
ntr = self.mobi_header.number_of_text_records
fntbr = self.mobi_header.first_non_book_record
fii = self.mobi_header.first_image_index
if fntbr == NULL_INDEX:
fntbr = len(self.records)
self.text_records = [TextRecord(r, self.records[r],
self.mobi_header.extra_data_flags, mf.decompress6) for r in xrange(1,
min(len(self.records), ntr+1))]
self.image_records, self.binary_records = [], []
self.font_records = []
image_index = 0
for i in xrange(fntbr, len(self.records)):
for i in xrange(self.mobi_header.first_resource_record, min(self.mobi_header.last_resource_record, len(self.records))):
if i in self.indexing_record_nums or i in self.huffman_record_nums:
continue
image_index += 1
@ -761,7 +758,7 @@ class MOBIFile(object): # {{{
fmt = None
if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS',
b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
b'AUDI', b'VIDE', b'FONT'}:
b'AUDI', b'VIDE', b'FONT', b'CRES', b'CONT', b'CMET'}:
try:
fmt = what(None, r.raw)
except:
@ -832,7 +829,6 @@ def inspect_mobi(mobi_file, ddir):
rec.dump(tdir)
# }}}

View File

@ -73,12 +73,11 @@ class MOBIFile(object):
h, h8 = mf.mobi_header, mf.mobi8_header
first_text_record = 1
offset = 0
res_end = len(mf.records)
self.resource_ranges = [(h8.first_resource_record, h8.last_resource_record, h8.first_image_index)]
if mf.kf8_type == 'joint':
offset = h.exth.kf8_header_index
res_end = offset - 1
self.resource_ranges.insert(0, (h.first_resource_record, h.last_resource_record, h.first_image_index))
self.resource_records = mf.records[h.first_non_book_record:res_end]
self.text_records = [TextRecord(i, r, h8.extra_data_flags,
mf.decompress8) for i, r in
enumerate(mf.records[first_text_record+offset:
@ -86,7 +85,7 @@ class MOBIFile(object):
self.raw_text = b''.join(r.raw for r in self.text_records)
self.header = self.mf.mobi8_header
self.extract_resources()
self.extract_resources(mf.records)
self.read_fdst()
self.read_indices()
self.build_files()
@ -151,13 +150,21 @@ class MOBIFile(object):
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
f.write(raw)
def extract_resources(self):
def extract_resources(self, records):
self.resource_map = []
known_types = {b'FLIS', b'FCIS', b'SRCS',
b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
b'AUDI', b'VIDE'}
b'AUDI', b'VIDE', b'CRES', b'CONT', b'CMET'}
for i, rec in enumerate(self.resource_records):
for i, rec in enumerate(records):
for (l, r, offset) in self.resource_ranges:
if l <= i <= r:
resource_index = i + 1
if offset is not None and resource_index >= offset:
resource_index -= offset
break
else:
continue
sig = rec.raw[:4]
payload = rec.raw
ext = 'dat'
@ -185,7 +192,7 @@ class MOBIFile(object):
elif sig in known_types:
suffix = '-' + sig.decode('ascii')
self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
self.resource_map.append(('%s/%06d%s.%s'%(prefix, resource_index, suffix, ext),
payload))
def read_tbs(self):