When dumping MOBI files for debugging also dump container records and HD images

This commit is contained in:
Kovid Goyal 2014-07-16 17:41:55 +05:30
parent f0a890e8c1
commit 63f9cbda0b
2 changed files with 94 additions and 1 deletions

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from struct import unpack_from
from calibre.ebooks.mobi.debug.headers import EXTHHeader
class ContainerHeader(object):
    """Parsed view of a container header record, used for debug dumps.

    The record is read as: a 4 byte identifier, a big-endian ``>IHHI``
    group at offset 4 (record size, type, record count, encoding), eight
    big-endian unsigned 32 bit integers at offset 16, and an optional EXTH
    header starting at offset 48 that is followed by the title.
    """

    def __init__(self, data):
        """Parse the raw bytes of a container header record.

        :param data: The complete raw record as a byte string.
        """
        self.ident = data[:4]
        self.record_size, self.type, self.count, self.encoding = unpack_from(b'>IHHI', data, 4)
        # Map the numeric codepage to a codec name. Unknown codepages are
        # kept as their repr so that they still show up in the dump.
        self.encoding = {
            1252: 'cp1252',
            65001: 'utf-8',
        }.get(self.encoding, repr(self.encoding))

        fields = list(unpack_from(b'>IIIIIIII', data, 16))
        self.unknowns1 = fields[:2]
        (self.num_of_resource_records,
         self.num_of_non_dummy_resource_records,
         self.offset_to_href_record,
         self.unknowns2,
         self.header_length,
         self.title_length) = fields[2:]

        # Filled in later by whoever walks the records that follow this header
        self.resources = []
        self.hrefs = []

        if data[48:52] == b'EXTH':
            self.exth = EXTHHeader(data[48:])
            raw_title = data[48 + self.exth.length:][:self.title_length]
            try:
                self.title = raw_title.decode(self.encoding)
            except LookupError:
                # Unknown codepage: self.encoding is only the repr of the
                # number, not a codec name. Do not abort the dump because of
                # it, show a lossy UTF-8 rendering of the title instead.
                self.title = raw_title.decode('utf-8', 'replace')
            self.is_image_container = self.exth[539] == 'application/image'
        else:
            self.exth = ' No EXTH header present '
            self.title = ''
            self.is_image_container = False

        # Everything past the declared header and title lengths
        self.bytes_after_exth = data[self.header_length + self.title_length:]
        self.null_bytes_after_exth = self.bytes_after_exth.count(b'\0')

    def add_hrefs(self, data):
        """Store the hrefs found in a raw href record.

        :param data: Byte string of UTF-8 encoded hrefs separated by ``|``.
        """
        # kindlegen inserts a trailing | after the last href
        # Build a real list: on Python 3 filter() is a one-shot iterator that
        # cannot be printed, indexed or iterated twice.
        self.hrefs = [href for href in data.decode('utf-8').split('|') if href]

    def __str__(self):
        """Return a human readable, multi-line dump of the header."""
        ans = [('*'*10) + ' Container Header ' + ('*'*10)]
        a = ans.append
        a('Record size: %d' % self.record_size)
        a('Type: %d' % self.type)
        a('Total number of records in this container: %d' % self.count)
        a('Encoding: %s' % self.encoding)
        a('Unknowns1: %s' % self.unknowns1)
        a('Num of resource records: %d' % self.num_of_resource_records)
        a('Num of non-dummy resource records: %d' % self.num_of_non_dummy_resource_records)
        a('Offset to href record: %d' % self.offset_to_href_record)
        a('Unknowns2: %s' % self.unknowns2)
        a('Header length: %d' % self.header_length)
        a('Title Length: %s' % self.title_length)
        a('hrefs: %s' % self.hrefs)
        a('Null bytes after EXTH: %d' % self.null_bytes_after_exth)
        if len(self.bytes_after_exth) != self.null_bytes_after_exth:
            a('Non-null bytes present after EXTH header!!!!')
        return '\n'.join(ans) + '\n\n' + str(self.exth) + '\n\n' + ('Title: %s' % self.title)

View File

@ -12,6 +12,7 @@ import sys, os, struct, textwrap
from itertools import izip from itertools import izip
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks.mobi.debug.containers import ContainerHeader
from calibre.ebooks.mobi.debug.headers import TextRecord from calibre.ebooks.mobi.debug.headers import TextRecord
from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex, from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
GuideIndex) GuideIndex)
@ -152,9 +153,11 @@ class MOBIFile(object):
def extract_resources(self, records): def extract_resources(self, records):
self.resource_map = [] self.resource_map = []
self.containers = []
known_types = {b'FLIS', b'FCIS', b'SRCS', known_types = {b'FLIS', b'FCIS', b'SRCS',
b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
b'AUDI', b'VIDE', b'CRES', b'CONT', b'CMET'} b'AUDI', b'VIDE', b'CRES', b'CONT', b'CMET'}
container = None
for i, rec in enumerate(records): for i, rec in enumerate(records):
for (l, r, offset) in self.resource_ranges: for (l, r, offset) in self.resource_ranges:
@ -181,7 +184,27 @@ class MOBIFile(object):
payload = (font['font_data'] if font['font_data'] else payload = (font['font_data'] if font['font_data'] else
font['raw_data']) font['raw_data'])
prefix, ext = 'fonts', font['ext'] prefix, ext = 'fonts', font['ext']
elif sig == b'CONT':
if payload == b'CONTBOUNDARY':
self.containers.append(container)
container = None
continue
container = ContainerHeader(payload)
elif sig == b'CRES':
container.resources.append(payload)
if container.is_image_container:
payload = payload[12:]
q = what(None, payload)
if q:
prefix, ext = 'hd-images', q
resource_index = len(container.resources)
elif sig == b'\xa0\xa0\xa0\xa0' and len(payload) == 4:
container.resources.append(None)
continue
elif sig not in known_types: elif sig not in known_types:
if container is not None and len(container.resources) == container.num_of_resource_records:
container.add_hrefs(payload)
continue
q = what(None, rec.raw) q = what(None, rec.raw)
if q: if q:
prefix, ext = 'images', q prefix, ext = 'images', q
@ -274,7 +297,7 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of: with open(alltext, 'wb') as of:
of.write(f.raw_text) of.write(f.raw_text)
for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'): for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows', 'hd-images',):
os.mkdir(os.path.join(ddir, x)) os.mkdir(os.path.join(ddir, x))
for rec in f.text_records: for rec in f.text_records:
@ -284,6 +307,10 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, href), 'wb') as fo: with open(os.path.join(ddir, href), 'wb') as fo:
fo.write(payload) fo.write(payload)
for i, container in enumerate(f.containers):
with open(os.path.join(ddir, 'container%d.txt' % (i + 1)), 'wb') as cf:
cf.write(str(container).encode('utf-8'))
if f.fdst: if f.fdst:
with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo: with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo:
fo.write(str(f.fdst).encode('utf-8')) fo.write(str(f.fdst).encode('utf-8'))