KF8 Input: Add support for KF8 files with obfuscated embedded fonts

This commit is contained in:
Kovid Goyal 2012-03-14 12:32:13 +05:30
parent cdfb1214e9
commit a8d46f2f40
4 changed files with 135 additions and 56 deletions

View File

@ -34,10 +34,12 @@ class MOBIInput(InputFormatPlugin):
accelerators):
if os.environ.get('USE_MOBIUNPACK', None) is not None:
pos = stream.tell()
try:
return run_mobi_unpack(stream, options, log, accelerators)
except Exception:
log.exception('mobi_unpack code not working')
stream.seek(pos)
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, datetime, sys, os, shutil, zlib
import struct, datetime, sys, os, shutil
from collections import OrderedDict, defaultdict
from lxml import html
@ -15,7 +15,7 @@ from lxml import html
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
get_trailing_data, decode_tbs)
get_trailing_data, decode_tbs, read_font_record)
from calibre.utils.magick.draw import identify_data
def format_bytes(byts):
@ -1154,26 +1154,13 @@ class FontRecord(object): # {{{
def __init__(self, idx, record):
self.raw = record.raw
name = '%06d'%idx
(self.uncompressed_size, self.unknown1, self.unknown2) = \
struct.unpack_from(b'>LLL', self.raw, 4)
self.payload = self.raw[4:]
self.ext = 'unknown'
if self.unknown1 == 1:
self.zlib_header = self.raw[self.unknown2:self.unknown2+2]
self.payload = zlib.decompress(self.raw[self.unknown2+2:-4], -15)
hdr = self.payload[:4]
if hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
self.ext = 'ttf'
if self.uncompressed_size != len(self.payload):
raise ValueError('Font record uncompressed size mismatch',
' expected: %d actual: %d'%(self.uncompressed_size,
len(self.payload)))
else:
print ('Unknown font record with fields: %s' %
[self.uncompressed_size, self.unknown1, self.unknown2])
print ('\tAdditional fields: %s'%((
struct.unpack_from(b'>LL', self.raw, 16),)))
self.name = '%s.%s'%(name, self.ext)
self.font = read_font_record(self.raw)
if self.font['err']:
raise ValueError('Failed to read font record: %s Headers: %s'%(
self.font['err'], self.font['headers']))
self.payload = (self.font['font_data'] if self.font['font_data'] else
self.font['raw_data'])
self.name = '%s.%s'%(name, self.font['ext'])
def dump(self, folder):
with open(os.path.join(folder, self.name), 'wb') as f:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os, zlib, imghdr
import struct, re, os, imghdr
from collections import namedtuple
from itertools import repeat
@ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.mobi.utils import read_font_record
Part = namedtuple('Part',
'num type filename start end aid')
@ -339,39 +340,16 @@ class Mobi8Reader(object):
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
pass # Ignore these records
elif typ == b'FONT':
# fonts only exist in K8 ebooks
# Format:
# bytes 0 - 3: 'FONT'
# bytes 4 - 7: ?? Expanded size in bytes ??
# bytes 8 - 11: ?? number of files ??
# bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
# bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib?
# The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
try:
fields = struct.unpack_from(b'>LLLLL', data, 4)
except:
fields = None
# self.log.debug('Font record fields: %s'%(fields,))
cdata = data[26:-4]
ext = 'dat'
try:
uncompressed_data = zlib.decompress(cdata, -15)
except:
self.log.warn('Failed to uncompress embedded font %d: '
'Fields: %s' % (fname_idx, fields,))
uncompressed_data = data[4:]
ext = 'failed'
if len(uncompressed_data) < 200:
self.log.warn('Failed to uncompress embedded font %d: '
'Fields: %s' % (fname_idx, fields,))
uncompressed_data = data[4:]
ext = 'failed'
hdr = uncompressed_data[:4]
if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
ext = 'ttf'
href = "fonts/%05d.%s" % (fname_idx, ext)
font = read_font_record(data)
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
if font['err']:
self.log.warn('Reading font record %d failed: %s'%(
fname_idx, font['err']))
if font['headers']:
self.log.debug('Font record headers: %s'%font['headers'])
with open(href.replace('/', os.sep), 'wb') as f:
f.write(uncompressed_data)
f.write(font['font_data'] if font['font_data'] else
font['raw_data'])
else:
imgtype = imghdr.what(None, data)
if imgtype is None:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, string, imghdr
import struct, string, imghdr, zlib
from collections import OrderedDict
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
@ -373,4 +373,116 @@ def mobify_image(data):
data = im.export('gif')
return data
def read_zlib_header(header):
    '''
    Parse the two byte header (CMF and FLG) at the start of a zlib stream.

    See sec 2.2 of RFC 1950 for the zlib stream format
    http://www.ietf.org/rfc/rfc1950.txt

    :param header: The first two bytes of the zlib stream. Any bytes after
        the first two are ignored.
    :return: A tuple ``(wbits, err)``. On success ``wbits`` is the base two
        logarithm of the window size (CINFO + 8) and ``err`` is None. On
        failure ``wbits`` is None and ``err`` is a message describing the
        problem.
    '''
    header = bytearray(header)
    if len(header) < 2:
        # Report truncated input like any other bad header instead of
        # failing with an IndexError below
        return None, 'Bad zlib header, too short'
    cmf, flg = header[0], header[1]
    # CMF and FLG, read as a big-endian 16 bit number, must be a multiple
    # of 31
    if (cmf*256 + flg)%31 != 0:
        return None, 'Bad zlib header, FCHECK failed'
    cm = cmf & 0b1111  # low nibble of CMF: compression method
    cinfo = cmf >> 4  # high nibble of CMF: log2(window size) - 8
    if cm != 8:
        return None, 'Unknown zlib compression method: %d'%cm
    if cinfo > 7:
        return None, 'Invalid CINFO field in zlib header: %d'%cinfo
    # FDICT is bit 5 of FLG (bits 0-4 are FCHECK, bits 6-7 are FLEVEL)
    fdict = (flg & 0b100000) >> 5
    if fdict != 0:
        return None, 'FDICT based zlib compression not supported'
    wbits = cinfo + 8
    return wbits, None
def read_font_record(data, extent=1040): # {{{
    '''
    Return the font encoded in the MOBI FONT record represented by data.

    The return value is a dict with fields raw_data, font_data, err, ext,
    headers. This function reports problems through the err field rather
    than by raising.

    :param data: The raw bytes of the FONT record, including the leading
        four byte record type.
    :param extent: The number of obfuscated bytes. So far I have only
        encountered files with 1040 obfuscated bytes. If you encounter an
        obfuscated record for which this function fails, try different
        extent values (easily automated). None means the whole payload is
        obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, dat for unknown and failed if an
    error occurred)
    headers is the dict of decoded headers from the font record or None if
    decoding failed
    '''
    # Format:
    # bytes  0 -  3:  'FONT'
    # bytes  4 -  7:  Uncompressed size
    # bytes  8 - 11:  flags
    #                   0b01 - zlib compression
    #                   0b10 - XOR obfuscated
    # bytes 12 - 15:  offset to start of compressed data
    # bytes 16 - 19:  length of XOR string
    # bytes 20 - 23:  offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        # Record too short (or not a byte buffer at all)
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data: the first extent bytes of the payload are
        # XORed with the (repeating) key stored in the record
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))
        if extent > 0 and (xor_len == 0 or len(key) < min(extent, xor_len)):
            # A zero length or truncated key would otherwise surface as a
            # ZeroDivisionError/IndexError in the loop below
            ans['err'] = 'Invalid XOR key in obfuscated font record'
            return ans
        for n in range(extent):
            buf[n] ^= key[n%xor_len] # XOR of buf and key
        font_data = bytes(buf)

    if flags & 0b1:
        # ZLIB compressed data
        if len(font_data) < 6:
            # Need at least the 2 byte zlib header and 4 byte checksum
            ans['err'] = 'Font data too short to be a zlib stream'
            return ans
        wbits, err = read_zlib_header(font_data[:2])
        if err is not None:
            ans['err'] = err
            return ans
        try:
            # remove two bytes of zlib header and 4 bytes of trailing checksum
            # negative wbits indicates no standard gzip header
            font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

        # The trailing Adler-32 checksum is deliberately not verified: the
        # stored value almost never matches the one computed from the
        # decompressed data (probably a buggy Adler32 implementation on the
        # producing side), so the size check above is the only validation.

    ans['font_data'] = font_data
    ans['ext'] = ('ttf' if font_data[:4] in {b'\0\1\0\0', b'true', b'ttcf'}
            else 'dat')

    return ans
# }}}