mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
KF8 Input: Add support for KF8 files with obfuscated embedded fonts
This commit is contained in:
parent
cdfb1214e9
commit
a8d46f2f40
@ -34,10 +34,12 @@ class MOBIInput(InputFormatPlugin):
|
||||
accelerators):
|
||||
|
||||
if os.environ.get('USE_MOBIUNPACK', None) is not None:
|
||||
pos = stream.tell()
|
||||
try:
|
||||
return run_mobi_unpack(stream, options, log, accelerators)
|
||||
except Exception:
|
||||
log.exception('mobi_unpack code not working')
|
||||
stream.seek(pos)
|
||||
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from lxml import html
|
||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, datetime, sys, os, shutil, zlib
|
||||
import struct, datetime, sys, os, shutil
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from lxml import html
|
||||
@ -15,7 +15,7 @@ from lxml import html
|
||||
from calibre.utils.date import utc_tz
|
||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
||||
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
|
||||
get_trailing_data, decode_tbs)
|
||||
get_trailing_data, decode_tbs, read_font_record)
|
||||
from calibre.utils.magick.draw import identify_data
|
||||
|
||||
def format_bytes(byts):
|
||||
@ -1154,26 +1154,13 @@ class FontRecord(object): # {{{
|
||||
def __init__(self, idx, record):
|
||||
self.raw = record.raw
|
||||
name = '%06d'%idx
|
||||
(self.uncompressed_size, self.unknown1, self.unknown2) = \
|
||||
struct.unpack_from(b'>LLL', self.raw, 4)
|
||||
self.payload = self.raw[4:]
|
||||
self.ext = 'unknown'
|
||||
if self.unknown1 == 1:
|
||||
self.zlib_header = self.raw[self.unknown2:self.unknown2+2]
|
||||
self.payload = zlib.decompress(self.raw[self.unknown2+2:-4], -15)
|
||||
hdr = self.payload[:4]
|
||||
if hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
|
||||
self.ext = 'ttf'
|
||||
if self.uncompressed_size != len(self.payload):
|
||||
raise ValueError('Font record uncompressed size mismatch',
|
||||
' expected: %d actual: %d'%(self.uncompressed_size,
|
||||
len(self.payload)))
|
||||
else:
|
||||
print ('Unknown font record with fields: %s' %
|
||||
[self.uncompressed_size, self.unknown1, self.unknown2])
|
||||
print ('\tAdditional fields: %s'%((
|
||||
struct.unpack_from(b'>LL', self.raw, 16),)))
|
||||
self.name = '%s.%s'%(name, self.ext)
|
||||
self.font = read_font_record(self.raw)
|
||||
if self.font['err']:
|
||||
raise ValueError('Failed to read font record: %s Headers: %s'%(
|
||||
self.font['err'], self.font['headers']))
|
||||
self.payload = (self.font['font_data'] if self.font['font_data'] else
|
||||
self.font['raw_data'])
|
||||
self.name = '%s.%s'%(name, self.font['ext'])
|
||||
|
||||
def dump(self, folder):
|
||||
with open(os.path.join(folder, self.name), 'wb') as f:
|
||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, re, os, zlib, imghdr
|
||||
import struct, re, os, imghdr
|
||||
from collections import namedtuple
|
||||
from itertools import repeat
|
||||
|
||||
@ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index
|
||||
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
|
||||
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
|
||||
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
|
||||
from calibre.ebooks.mobi.utils import read_font_record
|
||||
|
||||
Part = namedtuple('Part',
|
||||
'num type filename start end aid')
|
||||
@ -339,39 +340,16 @@ class Mobi8Reader(object):
|
||||
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
|
||||
pass # Ignore these records
|
||||
elif typ == b'FONT':
|
||||
# fonts only exist in K8 ebooks
|
||||
# Format:
|
||||
# bytes 0 - 3: 'FONT'
|
||||
# bytes 4 - 7: ?? Expanded size in bytes ??
|
||||
# bytes 8 - 11: ?? number of files ??
|
||||
# bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24)
|
||||
# bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib?
|
||||
# The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end
|
||||
try:
|
||||
fields = struct.unpack_from(b'>LLLLL', data, 4)
|
||||
except:
|
||||
fields = None
|
||||
# self.log.debug('Font record fields: %s'%(fields,))
|
||||
cdata = data[26:-4]
|
||||
ext = 'dat'
|
||||
try:
|
||||
uncompressed_data = zlib.decompress(cdata, -15)
|
||||
except:
|
||||
self.log.warn('Failed to uncompress embedded font %d: '
|
||||
'Fields: %s' % (fname_idx, fields,))
|
||||
uncompressed_data = data[4:]
|
||||
ext = 'failed'
|
||||
if len(uncompressed_data) < 200:
|
||||
self.log.warn('Failed to uncompress embedded font %d: '
|
||||
'Fields: %s' % (fname_idx, fields,))
|
||||
uncompressed_data = data[4:]
|
||||
ext = 'failed'
|
||||
hdr = uncompressed_data[:4]
|
||||
if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
|
||||
ext = 'ttf'
|
||||
href = "fonts/%05d.%s" % (fname_idx, ext)
|
||||
font = read_font_record(data)
|
||||
href = "fonts/%05d.%s" % (fname_idx, font['ext'])
|
||||
if font['err']:
|
||||
self.log.warn('Reading font record %d failed: %s'%(
|
||||
fname_idx, font['err']))
|
||||
if font['headers']:
|
||||
self.log.debug('Font record headers: %s'%font['headers'])
|
||||
with open(href.replace('/', os.sep), 'wb') as f:
|
||||
f.write(uncompressed_data)
|
||||
f.write(font['font_data'] if font['font_data'] else
|
||||
font['raw_data'])
|
||||
else:
|
||||
imgtype = imghdr.what(None, data)
|
||||
if imgtype is None:
|
||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import struct, string, imghdr
|
||||
import struct, string, imghdr, zlib
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
|
||||
@ -373,4 +373,116 @@ def mobify_image(data):
|
||||
data = im.export('gif')
|
||||
return data
|
||||
|
||||
def read_zlib_header(header):
|
||||
header = bytearray(header)
|
||||
# See sec 2.2 of RFC 1950 for the zlib stream format
|
||||
# http://www.ietf.org/rfc/rfc1950.txt
|
||||
if (header[0]*256 + header[1])%31 != 0:
|
||||
return None, 'Bad zlib header, FCHECK failed'
|
||||
|
||||
cmf = header[0] & 0b1111
|
||||
cinfo = header[0] >> 4
|
||||
if cmf != 8:
|
||||
return None, 'Unknown zlib compression method: %d'%cmf
|
||||
if cinfo > 7:
|
||||
return None, 'Invalid CINFO field in zlib header: %d'%cinfo
|
||||
fdict = (header[1]&0b10000)>>5
|
||||
if fdict != 0:
|
||||
return None, 'FDICT based zlib compression not supported'
|
||||
wbits = cinfo + 8
|
||||
return wbits, None
|
||||
|
||||
|
||||
def read_font_record(data, extent=1040): # {{{
|
||||
'''
|
||||
Return the font encoded in the MOBI FONT record represented by data.
|
||||
The return value in a dict with fields raw_data, font_data, err, ext,
|
||||
headers.
|
||||
|
||||
:param extent: The number of obfuscated bytes. So far I have only
|
||||
encountered files with 1040 obfuscated bytes. If you encounter an
|
||||
obfuscated record for which this function fails, try different extent
|
||||
values (easily automated).
|
||||
|
||||
raw_data is the raw data in the font record
|
||||
font_data is the decoded font_data or None if an error occurred
|
||||
err is not None if some error occurred
|
||||
ext is the font type (ttf for TrueType, dat for unknown and failed if an
|
||||
error occurred)
|
||||
headers is the list of decoded headers from the font record or None if
|
||||
decoding failed
|
||||
'''
|
||||
# Format:
|
||||
# bytes 0 - 3: 'FONT'
|
||||
# bytes 4 - 7: Uncompressed size
|
||||
# bytes 8 - 11: flags
|
||||
# bit 1 - zlib compression
|
||||
# bit 2 - XOR obfuscated
|
||||
# bytes 12 - 15: offset to start of compressed data
|
||||
# bytes 16 - 19: length of XOR string
|
||||
# bytes 19 - 23: offset to start of XOR data
|
||||
# The zlib compressed data begins with 2 bytes of header and
|
||||
# has 4 bytes of checksum at the end
|
||||
ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
|
||||
'headers':None}
|
||||
|
||||
try:
|
||||
usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
|
||||
b'>LLLLL', data, 4)
|
||||
except:
|
||||
ans['err'] = 'Failed to read font record header fields'
|
||||
return ans
|
||||
font_data = data[dstart:]
|
||||
ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
|
||||
'xor_start':xor_start, 'dstart':dstart}
|
||||
|
||||
if flags & 0b10:
|
||||
# De-obfuscate the data
|
||||
key = bytearray(data[xor_start:xor_start+xor_len])
|
||||
buf = bytearray(font_data)
|
||||
extent = len(font_data) if extent is None else extent
|
||||
extent = min(extent, len(font_data))
|
||||
|
||||
for n in xrange(extent):
|
||||
buf[n] ^= key[n%xor_len] # XOR of buf and key
|
||||
|
||||
font_data = bytes(buf)
|
||||
|
||||
if flags & 0b1:
|
||||
# ZLIB compressed data
|
||||
wbits, err = read_zlib_header(font_data[:2])
|
||||
if err is not None:
|
||||
ans['err'] = err
|
||||
return ans
|
||||
adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4)
|
||||
try:
|
||||
# remove two bytes of zlib header and 4 bytes of trailing checksum
|
||||
# negative wbits indicates no standard gzip header
|
||||
font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
|
||||
except Exception as e:
|
||||
ans['err'] = 'Failed to zlib decompress font data (%s)'%e
|
||||
return ans
|
||||
|
||||
if len(font_data) != usize:
|
||||
ans['err'] = 'Uncompressed font size mismatch'
|
||||
return ans
|
||||
|
||||
if False:
|
||||
# For some reason these almost never match, probably Amazon has a
|
||||
# buggy Adler32 implementation
|
||||
sig = (zlib.adler32(font_data) & 0xffffffff)
|
||||
if sig != adler32:
|
||||
ans['err'] = ('Adler checksum did not match. Stored: %d '
|
||||
'Calculated: %d')%(adler32, sig)
|
||||
return ans
|
||||
|
||||
ans['font_data'] = font_data
|
||||
ans['ext'] = ('ttf' if font_data[:4] in {b'\0\1\0\0', b'true', b'ttcf'}
|
||||
else 'dat')
|
||||
|
||||
return ans
|
||||
# }}}
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user