Add preliminary support for extracting FONT records to inspect mobi

This commit is contained in:
Kovid Goyal 2012-03-13 13:05:14 +05:30
parent 1ec08cf8c6
commit 41f168413b
3 changed files with 44 additions and 12 deletions

View File

@ -52,7 +52,7 @@ class MOBIInput(InputFormatPlugin):
mr.extract_content(u'.', parse_cache) mr.extract_content(u'.', parse_cache)
if mr.kf8_type is not None: if mr.kf8_type is not None:
log('Found KF8 MOBI of type %s'%mr.kf8_type) log('Found KF8 MOBI of type %r'%mr.kf8_type)
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
return os.path.abspath(Mobi8Reader(mr, log)()) return os.path.abspath(Mobi8Reader(mr, log)())

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct, datetime, sys, os, shutil import struct, datetime, sys, os, shutil, zlib
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from lxml import html from lxml import html
@ -1149,6 +1149,32 @@ class BinaryRecord(object): # {{{
# }}} # }}}
class FontRecord(object): # {{{

    '''
    A single FONT record, unpacked so that its contents can be written out
    as a standalone file.

    After construction the following attributes are available:

    ``raw``
        The unmodified record bytes, including the 4 byte record type.
    ``uncompressed_size``, ``unknown1``, ``unknown2``
        The three big-endian 32 bit integers stored directly after the
        record type. ``unknown1 == 1`` is treated as "payload is deflate
        compressed" and ``unknown2`` as the offset in ``raw`` at which the
        two byte zlib header starts.
    ``zlib_header``
        The two bytes found at ``unknown2`` for compressed records, ``None``
        for every other record.
    ``payload``
        The decompressed data, or everything after the record type when the
        record is not compressed or could not be decompressed.
    ``ext`` / ``name``
        File extension and file name used by :meth:`dump`. ``ext`` is
        ``'ttf'`` for a recognised font, ``'failed'`` when decompression
        failed and ``'unknown'`` otherwise.
    '''

    # First four bytes of the decompressed payloads that are saved as .ttf
    _FONT_SIGNATURES = frozenset((b'\0\1\0\0', b'true', b'ttcf'))

    def __init__(self, idx, record):
        '''
        :param idx: Number of the record, used to build the file name.
        :param record: Object whose ``raw`` attribute holds the record bytes.

        Raises ``struct.error`` if the record is too short to contain the
        three header fields.
        '''
        self.raw = record.raw

        (self.uncompressed_size, self.unknown1, self.unknown2) = \
                struct.unpack_from(b'>LLL', self.raw, 4)
        self.payload = self.raw[4:]
        self.ext = 'unknown'
        # Always defined, so callers can read it without checking unknown1
        self.zlib_header = None

        if self.unknown1 == 1:
            start = self.unknown2
            self.zlib_header = self.raw[start:start+2]
            try:
                # The two header bytes and the last four bytes of the record
                # are left out and the remainder is inflated as a raw
                # deflate stream (wbits=-15)
                self.payload = zlib.decompress(self.raw[start+2:-4], -15)
            except zlib.error:
                # A damaged record must not abort the inspection of the rest
                # of the file: keep the undecoded bytes and mark the file
                self.ext = 'failed'
            else:
                if self.payload[:4] in self._FONT_SIGNATURES:
                    self.ext = 'ttf'

        if self.ext != 'ttf':
            # Report every record that could not be identified as a font
            print ('Unknown font record with fields: %s' %
                    [self.uncompressed_size, self.unknown1, self.unknown2])

        self.name = '%s.%s'%('%06d'%idx, self.ext)

    def dump(self, folder):
        ''' Write the payload to the file ``self.name`` inside ``folder`` '''
        with open(os.path.join(folder, self.name), 'wb') as f:
            f.write(self.payload)

# }}}
class TBSIndexing(object): # {{{ class TBSIndexing(object): # {{{
def __init__(self, text_records, indices, doc_type): def __init__(self, text_records, indices, doc_type):
@ -1410,6 +1436,7 @@ class MOBIFile(object): # {{{
self.mobi_header.extra_data_flags, decompress) for r in xrange(1, self.mobi_header.extra_data_flags, decompress) for r in xrange(1,
min(len(self.records), ntr+1))] min(len(self.records), ntr+1))]
self.image_records, self.binary_records = [], [] self.image_records, self.binary_records = [], []
self.font_records = []
image_index = 0 image_index = 0
for i in xrange(fntbr, len(self.records)): for i in xrange(fntbr, len(self.records)):
if i in self.indexing_record_nums or i in self.huffman_record_nums: if i in self.indexing_record_nums or i in self.huffman_record_nums:
@ -1419,13 +1446,15 @@ class MOBIFile(object): # {{{
fmt = None fmt = None
if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS', if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS',
b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP',
b'AUDI', b'VIDE'}: b'AUDI', b'VIDE', b'FONT'}:
try: try:
width, height, fmt = identify_data(r.raw) width, height, fmt = identify_data(r.raw)
except: except:
pass pass
if fmt is not None: if fmt is not None:
self.image_records.append(ImageRecord(image_index, r, fmt)) self.image_records.append(ImageRecord(image_index, r, fmt))
elif r.raw[:4] == b'FONT':
self.font_records.append(FontRecord(i, r))
else: else:
self.binary_records.append(BinaryRecord(i, r)) self.binary_records.append(BinaryRecord(i, r))
@ -1465,10 +1494,11 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{
of.write(rec.raw) of.write(rec.raw)
alltext += rec.raw alltext += rec.raw
of.seek(0) of.seek(0)
root = html.fromstring(alltext.decode('utf-8')) if f.mobi_header.file_version < 8:
with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: root = html.fromstring(alltext.decode('utf-8'))
of.write(html.tostring(root, pretty_print=True, encoding='utf-8', with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
include_meta_content_type=True)) of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
include_meta_content_type=True))
if f.index_header is not None: if f.index_header is not None:
@ -1490,7 +1520,7 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{
f.tbs_indexing.dump(ddir) f.tbs_indexing.dump(ddir)
for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
('binary', 'binary_records')]: ('binary', 'binary_records'), ('font', 'font_records')]:
tdir = os.path.join(ddir, tdir) tdir = os.path.join(ddir, tdir)
os.mkdir(tdir) os.mkdir(tdir)
for rec in getattr(f, attr): for rec in getattr(f, attr):

View File

@ -351,7 +351,7 @@ class Mobi8Reader(object):
fields = struct.unpack_from(b'>LLLL', data, 4) fields = struct.unpack_from(b'>LLLL', data, 4)
except: except:
fields = None fields = None
#self.log.debug('Font record fields: %s'%(fields,)) # self.log.debug('Font record fields: %s'%(fields,))
cdata = data[26:-4] cdata = data[26:-4]
ext = 'dat' ext = 'dat'
try: try:
@ -361,11 +361,13 @@ class Mobi8Reader(object):
'Fields: %s' % (fname_idx, fields,)) 'Fields: %s' % (fname_idx, fields,))
uncompressed_data = data[4:] uncompressed_data = data[4:]
ext = 'failed' ext = 'failed'
hdr = uncompressed_data[0:4]
if len(uncompressed_data) < 200: if len(uncompressed_data) < 200:
self.log.warn('Corrupted font record: %d'%fname_idx) self.log.warn('Failed to uncompress embedded font %d: '
'Fields: %s' % (fname_idx, fields,))
uncompressed_data = data[4:]
ext = 'failed' ext = 'failed'
if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf': hdr = uncompressed_data[:4]
if ext != 'failed' and hdr in {b'\0\1\0\0', b'true', b'ttcf'}:
ext = 'ttf' ext = 'ttf'
href = "fonts/%05d.%s" % (fname_idx, ext) href = "fonts/%05d.%s" % (fname_idx, ext)
with open(href.replace('/', os.sep), 'wb') as f: with open(href.replace('/', os.sep), 'wb') as f: