mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 07:48:55 -04:00 
			
		
		
		
	KF8 Input: Add support for KF8 files with obfuscated embedded fonts
This commit is contained in:
		
							parent
							
								
									cdfb1214e9
								
							
						
					
					
						commit
						a8d46f2f40
					
				| @ -34,10 +34,12 @@ class MOBIInput(InputFormatPlugin): | ||||
|                 accelerators): | ||||
| 
 | ||||
|         if os.environ.get('USE_MOBIUNPACK', None) is not None: | ||||
|             pos = stream.tell() | ||||
|             try: | ||||
|                 return run_mobi_unpack(stream, options, log, accelerators) | ||||
|             except Exception: | ||||
|                 log.exception('mobi_unpack code not working') | ||||
|             stream.seek(pos) | ||||
| 
 | ||||
|         from calibre.ebooks.mobi.reader.mobi6 import MobiReader | ||||
|         from lxml import html | ||||
|  | ||||
| @ -7,7 +7,7 @@ __license__   = 'GPL v3' | ||||
| __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' | ||||
| __docformat__ = 'restructuredtext en' | ||||
| 
 | ||||
| import struct, datetime, sys, os, shutil, zlib | ||||
| import struct, datetime, sys, os, shutil | ||||
| from collections import OrderedDict, defaultdict | ||||
| 
 | ||||
| from lxml import html | ||||
| @ -15,7 +15,7 @@ from lxml import html | ||||
| from calibre.utils.date import utc_tz | ||||
| from calibre.ebooks.mobi.langcodes import main_language, sub_language | ||||
| from calibre.ebooks.mobi.utils import (decode_hex_number, decint, | ||||
|         get_trailing_data, decode_tbs) | ||||
|         get_trailing_data, decode_tbs, read_font_record) | ||||
| from calibre.utils.magick.draw import identify_data | ||||
| 
 | ||||
| def format_bytes(byts): | ||||
| @ -1154,26 +1154,13 @@ class FontRecord(object): # {{{ | ||||
|     def __init__(self, idx, record): | ||||
|         self.raw = record.raw | ||||
|         name = '%06d'%idx | ||||
|         (self.uncompressed_size, self.unknown1, self.unknown2) = \ | ||||
|                 struct.unpack_from(b'>LLL', self.raw, 4) | ||||
|         self.payload = self.raw[4:] | ||||
|         self.ext = 'unknown' | ||||
|         if self.unknown1 == 1: | ||||
|             self.zlib_header = self.raw[self.unknown2:self.unknown2+2] | ||||
|             self.payload = zlib.decompress(self.raw[self.unknown2+2:-4], -15) | ||||
|             hdr = self.payload[:4] | ||||
|             if hdr in {b'\0\1\0\0', b'true', b'ttcf'}: | ||||
|                 self.ext = 'ttf' | ||||
|             if self.uncompressed_size != len(self.payload): | ||||
|                 raise ValueError('Font record uncompressed size mismatch', | ||||
|                         ' expected: %d actual: %d'%(self.uncompressed_size, | ||||
|                             len(self.payload))) | ||||
|         else: | ||||
|             print ('Unknown font record with fields: %s' % | ||||
|                     [self.uncompressed_size, self.unknown1, self.unknown2]) | ||||
|             print ('\tAdditional fields: %s'%(( | ||||
|                 struct.unpack_from(b'>LL', self.raw, 16),))) | ||||
|         self.name = '%s.%s'%(name, self.ext) | ||||
|         self.font = read_font_record(self.raw) | ||||
|         if self.font['err']: | ||||
|             raise ValueError('Failed to read font record: %s Headers: %s'%( | ||||
|                 self.font['err'], self.font['headers'])) | ||||
|         self.payload = (self.font['font_data'] if self.font['font_data'] else | ||||
|                 self.font['raw_data']) | ||||
|         self.name = '%s.%s'%(name, self.font['ext']) | ||||
| 
 | ||||
|     def dump(self, folder): | ||||
|         with open(os.path.join(folder, self.name), 'wb') as f: | ||||
|  | ||||
| @ -7,7 +7,7 @@ __license__   = 'GPL v3' | ||||
| __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' | ||||
| __docformat__ = 'restructuredtext en' | ||||
| 
 | ||||
| import struct, re, os, zlib, imghdr | ||||
| import struct, re, os, imghdr | ||||
| from collections import namedtuple | ||||
| from itertools import repeat | ||||
| 
 | ||||
| @ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index | ||||
| from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc | ||||
| from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup | ||||
| from calibre.ebooks.metadata.opf2 import Guide, OPFCreator | ||||
| from calibre.ebooks.mobi.utils import read_font_record | ||||
| 
 | ||||
| Part = namedtuple('Part', | ||||
|     'num type filename start end aid') | ||||
| @ -339,39 +340,16 @@ class Mobi8Reader(object): | ||||
|                     b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: | ||||
|                 pass # Ignore these records | ||||
|             elif typ == b'FONT': | ||||
|                 # fonts only exist in K8 ebooks | ||||
|                 # Format: | ||||
|                 # bytes  0 -  3:  'FONT' | ||||
|                 # bytes  4 -  7:  ?? Expanded size in bytes ?? | ||||
|                 # bytes  8 - 11:  ?? number of files ?? | ||||
|                 # bytes 12 - 15:  ?? offset to start of compressed data ?? (typically 0x00000018 = 24) | ||||
|                 # bytes 16 - 23:  ?? typically all 0x00 ??  Are these compression flags from zlib? | ||||
|                 # The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end | ||||
|                 try: | ||||
|                     fields = struct.unpack_from(b'>LLLLL', data, 4) | ||||
|                 except: | ||||
|                     fields = None | ||||
|                 # self.log.debug('Font record fields: %s'%(fields,)) | ||||
|                 cdata = data[26:-4] | ||||
|                 ext = 'dat' | ||||
|                 try: | ||||
|                     uncompressed_data = zlib.decompress(cdata, -15) | ||||
|                 except: | ||||
|                     self.log.warn('Failed to uncompress embedded font %d: ' | ||||
|                             'Fields: %s' % (fname_idx, fields,)) | ||||
|                     uncompressed_data = data[4:] | ||||
|                     ext = 'failed' | ||||
|                 if len(uncompressed_data) < 200: | ||||
|                     self.log.warn('Failed to uncompress embedded font %d: ' | ||||
|                             'Fields: %s' % (fname_idx, fields,)) | ||||
|                     uncompressed_data = data[4:] | ||||
|                     ext = 'failed' | ||||
|                 hdr = uncompressed_data[:4] | ||||
|                 if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}: | ||||
|                     ext = 'ttf' | ||||
|                 href = "fonts/%05d.%s" % (fname_idx, ext) | ||||
|                 font = read_font_record(data) | ||||
|                 href = "fonts/%05d.%s" % (fname_idx, font['ext']) | ||||
|                 if font['err']: | ||||
|                     self.log.warn('Reading font record %d failed: %s'%( | ||||
|                         fname_idx, font['err'])) | ||||
|                     if font['headers']: | ||||
|                         self.log.debug('Font record headers: %s'%font['headers']) | ||||
|                 with open(href.replace('/', os.sep), 'wb') as f: | ||||
|                     f.write(uncompressed_data) | ||||
|                     f.write(font['font_data'] if font['font_data'] else | ||||
|                             font['raw_data']) | ||||
|             else: | ||||
|                 imgtype = imghdr.what(None, data) | ||||
|                 if imgtype is None: | ||||
|  | ||||
| @ -7,7 +7,7 @@ __license__   = 'GPL v3' | ||||
| __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' | ||||
| __docformat__ = 'restructuredtext en' | ||||
| 
 | ||||
| import struct, string, imghdr | ||||
| import struct, string, imghdr, zlib | ||||
| from collections import OrderedDict | ||||
| 
 | ||||
| from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail | ||||
| @ -373,4 +373,116 @@ def mobify_image(data): | ||||
|         data = im.export('gif') | ||||
|     return data | ||||
| 
 | ||||
def read_zlib_header(header):
    '''
    Validate the two byte header of a zlib stream and work out the window
    size it declares.

    :param header: The first two bytes (CMF and FLG) of a zlib stream. Any
    extra bytes are ignored.

    :return: A tuple ``(wbits, err)``. On success wbits is CINFO + 8 (the
    base two logarithm of the window size) and err is None. On failure wbits
    is None and err is a message describing what is wrong with the header.
    '''
    header = bytearray(header)
    # See sec 2.2 of RFC 1950 for the zlib stream format
    # http://www.ietf.org/rfc/rfc1950.txt
    if len(header) < 2:
        # Report truncated input the same way as every other failure instead
        # of letting an IndexError escape to the caller
        return None, 'Bad zlib header, too short'
    if (header[0]*256 + header[1])%31 != 0:
        return None, 'Bad zlib header, FCHECK failed'

    # CMF byte: low nibble is the compression method, high nibble is CINFO
    cm = header[0] & 0b1111
    cinfo = header[0] >> 4
    if cm != 8:
        return None, 'Unknown zlib compression method: %d'%cm
    if cinfo > 7:
        return None, 'Invalid CINFO field in zlib header: %d'%cinfo
    # FLG byte: bits 0-4 are FCHECK, bit 5 is FDICT, bits 6-7 are FLEVEL
    fdict = (header[1] & 0b100000) >> 5
    if fdict != 0:
        return None, 'FDICT based zlib compression not supported'
    wbits = cinfo + 8
    return wbits, None
| 
 | ||||
| 
 | ||||
def read_font_record(data, extent=1040): # {{{
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value is a dict with fields raw_data, font_data, err, ext,
    headers. Problems with the record are reported via the err field rather
    than by raising an exception.

    :param data: The raw bytes of the FONT record, including the leading
    four byte record type.

    :param extent: The number of obfuscated bytes. So far I have only
    encountered files with 1040 obfuscated bytes. If you encounter an
    obfuscated record for which this function fails, try different extent
    values (easily automated). None means all of the font data is
    obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, dat for unknown and failed if an
    error occurred)
    headers is the dict of decoded header fields from the font record or
    None if decoding failed
    '''
    # Format:
    # bytes  0 -  3:  'FONT'
    # bytes  4 -  7:  Uncompressed size
    # bytes  8 - 11:  flags
    #                   bit 1 - zlib compression
    #                   bit 2 - XOR obfuscated
    # bytes 12 - 15:  offset to start of compressed data
    # bytes 16 - 19:  length of XOR string
    # bytes 20 - 23:  offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except Exception:
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        # The loop below indexes key at n % xor_len for every n < extent, so
        # the key must be non-empty and cover min(extent, xor_len) bytes.
        # Report a missing or truncated key instead of raising.
        if extent > 0 and (xor_len == 0 or len(key) < min(extent, xor_len)):
            ans['err'] = ('XOR key for obfuscated font is missing or '
                    'truncated')
            return ans

        for n in range(extent):
            buf[n] ^= key[n%xor_len] # XOR of buf and key

        font_data = bytes(buf)

    if flags & 0b1:
        # ZLIB compressed data
        if len(font_data) < 6:
            # Not even room for the 2 byte header and 4 byte checksum
            ans['err'] = 'Compressed font data is too short'
            return ans
        wbits, err = read_zlib_header(font_data[:2])
        if err is not None:
            ans['err'] = err
            return ans
        try:
            # remove two bytes of zlib header and 4 bytes of trailing checksum
            # negative wbits indicates no standard gzip header
            font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

        # The trailing Adler32 checksum is deliberately not verified: the
        # stored values almost never match the computed ones, so a mismatch
        # is not treated as an error.

    ans['font_data'] = font_data
    ans['ext'] = ('ttf' if font_data[:4] in {b'\0\1\0\0', b'true', b'ttcf'}
                    else 'dat')

    return ans
# }}}
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user