diff --git a/src/calibre/ebooks/conversion/plugins/mobi_input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py index a6aa05a574..9d71b69891 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_input.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py @@ -34,10 +34,12 @@ class MOBIInput(InputFormatPlugin): accelerators): if os.environ.get('USE_MOBIUNPACK', None) is not None: + pos = stream.tell() try: return run_mobi_unpack(stream, options, log, accelerators) except Exception: log.exception('mobi_unpack code not working') + stream.seek(pos) from calibre.ebooks.mobi.reader.mobi6 import MobiReader from lxml import html diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 0444105003..dabd827060 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, datetime, sys, os, shutil, zlib +import struct, datetime, sys, os, shutil from collections import OrderedDict, defaultdict from lxml import html @@ -15,7 +15,7 @@ from lxml import html from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs) + get_trailing_data, decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data def format_bytes(byts): @@ -1154,26 +1154,13 @@ class FontRecord(object): # {{{ def __init__(self, idx, record): self.raw = record.raw name = '%06d'%idx - (self.uncompressed_size, self.unknown1, self.unknown2) = \ - struct.unpack_from(b'>LLL', self.raw, 4) - self.payload = self.raw[4:] - self.ext = 'unknown' - if self.unknown1 == 1: - self.zlib_header = self.raw[self.unknown2:self.unknown2+2] - self.payload = zlib.decompress(self.raw[self.unknown2+2:-4], -15) - hdr = self.payload[:4] - if hdr in {b'\0\1\0\0', b'true', b'ttcf'}: - 
self.ext = 'ttf' - if self.uncompressed_size != len(self.payload): - raise ValueError('Font record uncompressed size mismatch', - ' expected: %d actual: %d'%(self.uncompressed_size, - len(self.payload))) - else: - print ('Unknown font record with fields: %s' % - [self.uncompressed_size, self.unknown1, self.unknown2]) - print ('\tAdditional fields: %s'%(( - struct.unpack_from(b'>LL', self.raw, 16),))) - self.name = '%s.%s'%(name, self.ext) + self.font = read_font_record(self.raw) + if self.font['err']: + raise ValueError('Failed to read font record: %s Headers: %s'%( + self.font['err'], self.font['headers'])) + self.payload = (self.font['font_data'] if self.font['font_data'] else + self.font['raw_data']) + self.name = '%s.%s'%(name, self.font['ext']) def dump(self, folder): with open(os.path.join(folder, self.name), 'wb') as f: diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index ed0088c168..f5421bc9ea 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, re, os, zlib, imghdr +import struct, re, os, imghdr from collections import namedtuple from itertools import repeat @@ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup from calibre.ebooks.metadata.opf2 import Guide, OPFCreator +from calibre.ebooks.mobi.utils import read_font_record Part = namedtuple('Part', 'num type filename start end aid') @@ -339,39 +340,16 @@ class Mobi8Reader(object): b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: pass # Ignore these records elif typ == b'FONT': - # fonts only exist in K8 ebooks - # Format: - # bytes 0 - 3: 'FONT' - # bytes 4 - 7: ?? Expanded size in bytes ?? - # bytes 8 - 11: ?? number of files ?? 
- # bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24) - # bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib? - # The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end - try: - fields = struct.unpack_from(b'>LLLLL', data, 4) - except: - fields = None - # self.log.debug('Font record fields: %s'%(fields,)) - cdata = data[26:-4] - ext = 'dat' - try: - uncompressed_data = zlib.decompress(cdata, -15) - except: - self.log.warn('Failed to uncompress embedded font %d: ' - 'Fields: %s' % (fname_idx, fields,)) - uncompressed_data = data[4:] - ext = 'failed' - if len(uncompressed_data) < 200: - self.log.warn('Failed to uncompress embedded font %d: ' - 'Fields: %s' % (fname_idx, fields,)) - uncompressed_data = data[4:] - ext = 'failed' - hdr = uncompressed_data[:4] - if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}: - ext = 'ttf' - href = "fonts/%05d.%s" % (fname_idx, ext) + font = read_font_record(data) + href = "fonts/%05d.%s" % (fname_idx, font['ext']) + if font['err']: + self.log.warn('Reading font record %d failed: %s'%( + fname_idx, font['err'])) + if font['headers']: + self.log.debug('Font record headers: %s'%font['headers']) with open(href.replace('/', os.sep), 'wb') as f: - f.write(uncompressed_data) + f.write(font['font_data'] if font['font_data'] else + font['raw_data']) else: imgtype = imghdr.what(None, data) if imgtype is None: diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 3a9cf1c0ba..feca894a66 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, string, imghdr +import struct, string, imghdr, zlib from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail @@ -373,4 +373,116 @@ def mobify_image(data): data 
def read_zlib_header(header):
    '''
    Parse the two byte header (CMF, FLG) at the start of a zlib stream.

    See sec 2.2 of RFC 1950 for the zlib stream format
    http://www.ietf.org/rfc/rfc1950.txt

    :param header: The first two bytes of the zlib stream (extra bytes are
        ignored).
    :return: A tuple ``(wbits, err)``. On success ``wbits`` is CINFO + 8 (the
        base two logarithm of the window size) and ``err`` is None. On
        failure ``wbits`` is None and ``err`` is a human readable message.
    '''
    header = bytearray(header)
    if len(header) < 2:
        # Happens when the data offset in a font record points past its end
        return None, 'Bad zlib header, too short'
    cmf, flg = header[0], header[1]
    if (cmf*256 + flg) % 31 != 0:
        return None, 'Bad zlib header, FCHECK failed'

    cm = cmf & 0b1111  # low nibble of CMF: compression method
    cinfo = cmf >> 4   # high nibble of CMF: window size information
    if cm != 8:
        return None, 'Unknown zlib compression method: %d'%cm
    if cinfo > 7:
        return None, 'Invalid CINFO field in zlib header: %d'%cinfo
    # FLG layout: bits 0-4 FCHECK, bit 5 FDICT, bits 6-7 FLEVEL
    fdict = (flg & 0b100000) >> 5
    if fdict != 0:
        return None, 'FDICT based zlib compression not supported'
    wbits = cinfo + 8
    return wbits, None


def read_font_record(data, extent=1040): # {{{
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value is a dict with fields raw_data, font_data, err, ext,
    headers.

    :param data: The raw bytes of the FONT record, including the leading
        four byte ``FONT`` identifier.
    :param extent: The number of obfuscated bytes. So far I have only
        encountered files with 1040 obfuscated bytes. If you encounter an
        obfuscated record for which this function fails, try different extent
        values (easily automated). None means the whole payload is
        obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, dat for unknown and failed if an
    error occurred)
    headers is the dict of decoded header fields from the font record or
    None if decoding the header failed
    '''
    # Format:
    # bytes  0 -  3:  'FONT'
    # bytes  4 -  7:  Uncompressed size
    # bytes  8 - 11:  flags
    #                   0b01 - zlib compression
    #                   0b10 - XOR obfuscated
    # bytes 12 - 15:  offset to start of compressed data
    # bytes 16 - 19:  length of XOR string
    # bytes 20 - 23:  offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        if extent > 0:
            # A zero length or truncated key would otherwise blow up with a
            # ZeroDivisionError/IndexError in the loop below
            if xor_len == 0 or len(key) < min(extent, xor_len):
                ans['err'] = 'Invalid XOR key in font record'
                return ans

            for n in range(extent):
                buf[n] ^= key[n%xor_len] # XOR of buf and key

        font_data = bytes(buf)

    if flags & 0b1:
        # ZLIB compressed data
        wbits, err = read_zlib_header(font_data[:2])
        if err is not None:
            ans['err'] = err
            return ans
        try:
            # remove two bytes of zlib header and 4 bytes of trailing checksum
            # negative wbits indicates a raw deflate stream (no header)
            font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

        # The trailing Adler32 checksum is deliberately not verified: for
        # some reason the stored and calculated values almost never match in
        # real files.

    ans['font_data'] = font_data
    ans['ext'] = ('ttf' if font_data[:4] in {b'\0\1\0\0', b'true', b'ttcf'}
            else 'dat')

    return ans
# }}}