class ContainerHeader(object):
    """Parsed view of the first record of a resource container.

    Layout, as read by ``__init__``:

    * bytes 0-3: identifier, kept verbatim in ``ident``
    * bytes 4-15: record size (uint32), type (uint16), record count
      (uint16) and a numeric encoding code (uint32), all big-endian
    * bytes 16-47: eight big-endian uint32 values (two unknowns, the
      number of resource records, the number of non-dummy resource
      records, the offset to the href record, one more unknown, the
      header length and the title length)
    * byte 48 onwards: an optional ``EXTH`` block followed by the title

    ``resources`` and ``hrefs`` start out empty and are filled in by the
    code that walks the records following this header.
    """

    # Numeric encoding codes that correspond to a Python codec name.
    _ENCODINGS = {
        1252: 'cp1252',
        65001: 'utf-8',
    }

    def __init__(self, data):
        """Parse the raw bytes ``data`` of a container header record.

        Raises ``struct.error`` when ``data`` is shorter than the 48 byte
        fixed part of the header.
        """
        self.ident = data[:4]
        (self.record_size, self.type, self.count,
         encoding_code) = unpack_from(b'>IHHI', data, 4)
        # Unknown codes are kept, as their repr, purely for display.
        self.encoding = self._ENCODINGS.get(encoding_code, repr(encoding_code))

        rest = list(unpack_from(b'>IIIIIIII', data, 16))
        self.unknowns1 = rest[:2]
        self.num_of_resource_records = rest[2]
        self.num_of_non_dummy_resource_records = rest[3]
        self.offset_to_href_record = rest[4]
        self.unknowns2 = rest[5]
        self.header_length = rest[6]
        self.title_length = rest[7]

        self.resources = []
        self.hrefs = []

        if data[48:52] == b'EXTH':
            self.exth = EXTHHeader(data[48:])
            # The title sits directly after the EXTH block.
            title_start = 48 + self.exth.length
            raw_title = data[title_start:][:self.title_length]
            self.title = self._decode_title(raw_title, self.encoding)
            self.is_image_container = self.exth[539] == 'application/image'
        else:
            self.exth = ' No EXTH header present '
            self.title = ''
            self.is_image_container = False

        self.bytes_after_exth = data[self.header_length + self.title_length:]
        self.null_bytes_after_exth = (
            len(self.bytes_after_exth) -
            len(self.bytes_after_exth.replace(b'\0', b'')))

    @staticmethod
    def _decode_title(raw, encoding):
        """Decode ``raw`` with ``encoding``, tolerating unknown codec names.

        When the header carries an encoding code that is not in
        ``_ENCODINGS``, ``encoding`` is just the repr of that number and is
        not a codec Python knows about. Fall back to UTF-8 with replacement
        characters so the header can still be inspected instead of raising
        ``LookupError``.
        """
        try:
            return raw.decode(encoding)
        except LookupError:
            return raw.decode('utf-8', 'replace')

    def add_hrefs(self, data):
        """Store the ``|`` separated hrefs found in the bytes ``data``."""
        # kindlegen inserts a trailing | after the last href, which yields an
        # empty final entry; drop empty entries. A real list is built (not a
        # lazy ``filter`` object) so it can be printed and iterated repeatedly.
        self.hrefs = [href for href in data.decode('utf-8').split('|') if href]

    def __str__(self):
        """Return a multi-line, human readable dump of the header."""
        ans = [
            ('*'*10) + ' Container Header ' + ('*'*10),
            'Record size: %d' % self.record_size,
            'Type: %d' % self.type,
            'Total number of records in this container: %d' % self.count,
            'Encoding: %s' % self.encoding,
            'Unknowns1: %s' % self.unknowns1,
            'Num of resource records: %d' % self.num_of_resource_records,
            'Num of non-dummy resource records: %d' % self.num_of_non_dummy_resource_records,
            'Offset to href record: %d' % self.offset_to_href_record,
            'Unknowns2: %s' % self.unknowns2,
            'Header length: %d' % self.header_length,
            'Title Length: %s' % self.title_length,
            'hrefs: %s' % self.hrefs,
            'Null bytes after EXTH: %d' % self.null_bytes_after_exth,
        ]
        if len(self.bytes_after_exth) != self.null_bytes_after_exth:
            ans.append('Non-null bytes present after EXTH header!!!!')
        return '\n'.join(ans) + '\n\n' + str(self.exth) + '\n\n' + ('Title: %s' % self.title)
container.resources.append(payload) + if container.is_image_container: + payload = payload[12:] + q = what(None, payload) + if q: + prefix, ext = 'hd-images', q + resource_index = len(container.resources) + elif sig == b'\xa0\xa0\xa0\xa0' and len(payload) == 4: + container.resources.append(None) + continue elif sig not in known_types: + if container is not None and len(container.resources) == container.num_of_resource_records: + container.add_hrefs(payload) + continue q = what(None, rec.raw) if q: prefix, ext = 'images', q @@ -274,7 +297,7 @@ def inspect_mobi(mobi_file, ddir): with open(alltext, 'wb') as of: of.write(f.raw_text) - for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows'): + for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows', 'hd-images',): os.mkdir(os.path.join(ddir, x)) for rec in f.text_records: @@ -284,6 +307,10 @@ def inspect_mobi(mobi_file, ddir): with open(os.path.join(ddir, href), 'wb') as fo: fo.write(payload) + for i, container in enumerate(f.containers): + with open(os.path.join(ddir, 'container%d.txt' % (i + 1)), 'wb') as cf: + cf.write(str(container).encode('utf-8')) + if f.fdst: with open(os.path.join(ddir, 'fdst.record'), 'wb') as fo: fo.write(str(f.fdst).encode('utf-8'))