#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, datetime, sys, os, shutil
from collections import OrderedDict, defaultdict
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
        get_trailing_data)
from calibre.utils.magick.draw import identify_data
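
# A MOBI file is a PalmDB (PDB) container: a 78 byte database header, a
# table of 8 byte record descriptors, then the records themselves. Record 0
# holds the MOBI header (and optionally an EXTH metadata header); text,
# index, image and binary records follow. The classes below parse each of
# those pieces in turn.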
# PalmDB {{{
class PalmDOCAttributes(object):

    class Attr(object):

        def __init__(self, name, field, val):
            self.name = name
            self.val = val & field

        def __str__(self):
            return '%s: %s'%(self.name, bool(self.val))

    def __init__(self, raw):
        self.val = struct.unpack(b'<H', raw)[0]
        self.attributes = []
        # Bit masks per the standard Palm OS database header attribute flags
        for name, field in [('Read Only', 0x02), ('Dirty AppInfoArea', 0x04),
                ('Backup this database', 0x08),
                ('Okay to install newer over existing copy, if present on PalmPilot', 0x10),
                ('Force the PalmPilot to reset after this database is installed', 0x20),
                ('Don\'t allow copy of file to be beamed to other Pilot',
                    0x40)]:
            self.attributes.append(PalmDOCAttributes.Attr(name, field,
                self.val))

    def __str__(self):
        attrs = '\n\t'.join([str(x) for x in self.attributes])
        return 'PalmDOC Attributes: %s\n\t%s'%(bin(self.val), attrs)

class PalmDB(object):

    def __init__(self, raw):
        self.raw = raw

        if self.raw.startswith(b'TPZ'):
            raise ValueError('This is a Topaz file')

        self.name = self.raw[:32].replace(b'\x00', b'')
        self.attributes = PalmDOCAttributes(self.raw[32:34])
        self.version = struct.unpack(b'>H', self.raw[34:36])[0]

        palm_epoch = datetime.datetime(1904, 1, 1, tzinfo=utc_tz)
        self.creation_date_raw = struct.unpack(b'>I', self.raw[36:40])[0]
        self.creation_date = (palm_epoch +
                datetime.timedelta(seconds=self.creation_date_raw))
        self.modification_date_raw = struct.unpack(b'>I', self.raw[40:44])[0]
        self.modification_date = (palm_epoch +
                datetime.timedelta(seconds=self.modification_date_raw))
        self.last_backup_date_raw = struct.unpack(b'>I', self.raw[44:48])[0]
        self.last_backup_date = (palm_epoch +
                datetime.timedelta(seconds=self.last_backup_date_raw))
        self.modification_number = struct.unpack(b'>I', self.raw[48:52])[0]
        self.app_info_id = self.raw[52:56]
        self.sort_info_id = self.raw[56:60]
        self.type = self.raw[60:64]
        self.creator = self.raw[64:68]
        self.ident = self.type + self.creator
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise ValueError('Unknown book ident: %r'%self.ident)
        self.uid_seed = self.raw[68:72]
        self.next_rec_list_id = self.raw[72:76]

        self.number_of_records, = struct.unpack(b'>H', self.raw[76:78])

    def __str__(self):
        ans = ['*'*20 + ' PalmDB Header '+ '*'*20]
        ans.append('Name: %r'%self.name)
        ans.append(str(self.attributes))
        ans.append('Version: %s'%self.version)
        ans.append('Creation date: %s (%s)'%(self.creation_date.isoformat(),
            self.creation_date_raw))
        ans.append('Modification date: %s (%s)'%(
            self.modification_date.isoformat(), self.modification_date_raw))
        ans.append('Backup date: %s (%s)'%(self.last_backup_date.isoformat(),
            self.last_backup_date_raw))
        ans.append('Modification number: %s'%self.modification_number)
        ans.append('App Info ID: %r'%self.app_info_id)
        ans.append('Sort Info ID: %r'%self.sort_info_id)
        ans.append('Type: %r'%self.type)
        ans.append('Creator: %r'%self.creator)
        ans.append('UID seed: %r'%self.uid_seed)
        ans.append('Next record list id: %r'%self.next_rec_list_id)
        ans.append('Number of records: %s'%self.number_of_records)

        return '\n'.join(ans)
# }}}

class Record(object): # {{{

    def __init__(self, raw, header):
        self.offset, self.flags, self.uid = header
        self.raw = raw

    @property
    def header(self):
        return 'Offset: %d Flags: %d UID: %d First 4 bytes: %r Size: %d'%(
                self.offset, self.flags, self.uid, self.raw[:4],
                len(self.raw))
# }}}
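
# EXTH records carry the book metadata (author, publisher, cover pointer and
# so on). On disk each record is a 4 byte type, a 4 byte length that includes
# these eight header bytes, and then the payload; EXTHHeader.read_record()
# below walks the records using exactly that layout.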
# EXTH {{{
class EXTHRecord(object):

    def __init__(self, type_, data):
        self.type = type_
        self.data = data
        self.name = {
                1 : 'DRM Server id',
                2 : 'DRM Commerce id',
                3 : 'DRM ebookbase book id',
                100 : 'author',
                101 : 'publisher',
                102 : 'imprint',
                103 : 'description',
                104 : 'isbn',
                105 : 'subject',
                106 : 'publishingdate',
                107 : 'review',
                108 : 'contributor',
                109 : 'rights',
                110 : 'subjectcode',
                111 : 'type',
                112 : 'source',
                113 : 'asin',
                114 : 'versionnumber',
                115 : 'sample',
                116 : 'startreading',
                117 : 'adult',
                118 : 'retailprice',
                119 : 'retailpricecurrency',
                201 : 'coveroffset',
                202 : 'thumboffset',
                203 : 'hasfakecover',
                204 : 'Creator Software',
                205 : 'Creator Major Version', # '>I'
                206 : 'Creator Minor Version', # '>I'
                207 : 'Creator Build Number', # '>I'
                208 : 'watermark',
                209 : 'tamper_proof_keys',
                300 : 'fontsignature',
                301 : 'clippinglimit', # percentage '>B'
                402 : 'publisherlimit',
                404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled
                501 : 'cdetype', # 4 chars (PDOC or EBOK)
                502 : 'lastupdatetime',
                503 : 'updatedtitle',
        }.get(self.type, repr(self.type))

        if self.name in ('coveroffset', 'thumboffset', 'hasfakecover',
                'Creator Major Version', 'Creator Minor Version',
                'Creator Build Number', 'Creator Software', 'startreading'):
            self.data, = struct.unpack(b'>I', self.data)

    def __str__(self):
        return '%s (%d): %r'%(self.name, self.type, self.data)

class EXTHHeader(object):

    def __init__(self, raw):
        self.raw = raw
        if not self.raw.startswith(b'EXTH'):
            raise ValueError('EXTH header does not start with EXTH')
        self.length, = struct.unpack(b'>I', self.raw[4:8])
        self.count, = struct.unpack(b'>I', self.raw[8:12])

        pos = 12
        self.records = []
        for i in xrange(self.count):
            pos = self.read_record(pos)

    def read_record(self, pos):
        type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
        data = self.raw[(pos+8):(pos+length)]
        self.records.append(EXTHRecord(type_, data))
        return pos + length

    def __str__(self):
        ans = ['*'*20 + ' EXTH Header '+ '*'*20]
        ans.append('EXTH header length: %d'%self.length)
        ans.append('Number of EXTH records: %d'%self.count)
        ans.append('EXTH records...')
        for r in self.records:
            ans.append(str(r))
        return '\n'.join(ans)
# }}}
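
# The MOBI header lives at the start of record 0: a 16 byte PalmDOC header
# (compression, text length, text record count/size, encryption), then the
# b'MOBI' block whose length field tells you where the optional EXTH header
# begins (at 16 + length). All multi-byte fields are big-endian.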
class MOBIHeader(object): # {{{

    def __init__(self, record0):
        self.raw = record0.raw

        self.compression_raw = self.raw[:2]
        self.compression = {1: 'No compression', 2: 'PalmDoc compression',
                17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H',
                    self.compression_raw)[0],
                    repr(self.compression_raw))
        self.unused = self.raw[2:4]
        self.text_length, = struct.unpack(b'>I', self.raw[4:8])
        self.number_of_text_records, self.text_record_size = \
                struct.unpack(b'>HH', self.raw[8:12])
        self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
        self.encryption_type = {0: 'No encryption',
                1: 'Old mobipocket encryption',
                2: 'Mobipocket encryption'}.get(self.encryption_type_raw,
                    repr(self.encryption_type_raw))
        self.unknown = self.raw[14:16]

        self.identifier = self.raw[16:20]
        if self.identifier != b'MOBI':
            raise ValueError('Identifier %r unknown'%self.identifier)

        self.length, = struct.unpack(b'>I', self.raw[20:24])
        self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
        self.type = {
                2 : 'Mobipocket book',
                3 : 'PalmDOC book',
                4 : 'Audio',
                257 : 'News',
                258 : 'News Feed',
                259 : 'News magazine',
                513 : 'PICS',
                514 : 'Word',
                515 : 'XLS',
                516 : 'PPT',
                517 : 'TEXT',
                518 : 'HTML',
        }.get(self.type_raw, repr(self.type_raw))

        self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
        self.encoding = {
                1252 : 'cp1252',
                65001: 'utf-8',
        }.get(self.encoding_raw, repr(self.encoding_raw))
        self.uid = self.raw[32:36]
        self.file_version, = struct.unpack(b'>I', self.raw[36:40])
        self.reserved = self.raw[40:48]
        self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
        self.reserved2 = self.raw[52:80]
        self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
        self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
        self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
        self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
        langcode = self.locale_raw
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

        self.input_language = self.raw[96:100]
        self.output_language = self.raw[100:104]
        self.min_version, = struct.unpack(b'>I', self.raw[104:108])
        self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
        self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
        self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
        self.unknown2 = self.raw[120:128]
        self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
        self.has_exth = bool(self.exth_flags & 0x40)
        self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
        if self.has_drm_data:
            self.unknown3 = self.raw[132:164]
            self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
            self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
            self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
            self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
        self.has_extra_data_flags = self.length >= 232 and \
                len(self.raw) >= 232+16
        self.has_fcis_flis = False
        self.has_multibytes = self.has_indexing_bytes = \
                self.has_uncrossable_breaks = False
        self.extra_data_flags = 0
        if self.has_extra_data_flags:
            self.unknown4 = self.raw[180:192]
            self.first_content_record, self.last_content_record = \
                    struct.unpack(b'>HH', self.raw[192:196])
            self.unknown5, = struct.unpack(b'>I', self.raw[196:200])
            (self.fcis_number, self.fcis_count, self.flis_number,
                    self.flis_count) = struct.unpack(b'>IIII',
                            self.raw[200:216])
            self.unknown6 = self.raw[216:240]
            self.extra_data_flags = struct.unpack(b'>I',
                    self.raw[240:244])[0]
            self.has_multibytes = bool(self.extra_data_flags & 0b1)
            self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
            self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
            self.primary_index_record, = struct.unpack(b'>I',
                    self.raw[244:248])

        if self.has_exth:
            self.exth_offset = 16 + self.length

            self.exth = EXTHHeader(self.raw[self.exth_offset:])

            self.end_of_exth = self.exth_offset + self.exth.length
            self.bytes_after_exth = self.fullname_offset - self.end_of_exth

    def __str__(self):
        ans = ['*'*20 + ' MOBI Header '+ '*'*20]
        ans.append('Compression: %s'%self.compression)
        ans.append('Unused: %r'%self.unused)
        ans.append('Number of text records: %d'%self.number_of_text_records)
        ans.append('Text record size: %d'%self.text_record_size)
        ans.append('Encryption: %s'%self.encryption_type)
        ans.append('Unknown: %r'%self.unknown)
        ans.append('Identifier: %r'%self.identifier)
        ans.append('Header length: %d'% self.length)
        ans.append('Type: %s'%self.type)
        ans.append('Encoding: %s'%self.encoding)
        ans.append('UID: %r'%self.uid)
        ans.append('File version: %d'%self.file_version)
        ans.append('Reserved: %r'%self.reserved)
        ans.append('Secondary index record: %d (null val: %d)'%(
            self.secondary_index_record, 0xffffffff))
        ans.append('Reserved2: %r'%self.reserved2)
        ans.append('First non-book record (null value: %d): %d'%(0xffffffff,
            self.first_non_book_record))
        ans.append('Full name offset: %d'%self.fullname_offset)
        ans.append('Full name length: %d bytes'%self.fullname_length)
        ans.append('Langcode: %r'%self.locale_raw)
        ans.append('Language: %s'%self.language)
        ans.append('Sub language: %s'%self.sublanguage)
        ans.append('Input language: %r'%self.input_language)
        ans.append('Output language: %r'%self.output_language)
        ans.append('Min version: %d'%self.min_version)
        ans.append('First Image index: %d'%self.first_image_index)
        ans.append('Huffman record offset: %d'%self.huffman_record_offset)
        ans.append('Huffman record count: %d'%self.huffman_record_count)
        ans.append('Unknown2: %r'%self.unknown2)
        ans.append('EXTH flags: %r (%s)'%(self.exth_flags, self.has_exth))
        if self.has_drm_data:
            ans.append('Unknown3: %r'%self.unknown3)
            ans.append('DRM Offset: %s'%self.drm_offset)
            ans.append('DRM Count: %s'%self.drm_count)
            ans.append('DRM Size: %s'%self.drm_size)
            ans.append('DRM Flags: %r'%self.drm_flags)
        if self.has_extra_data_flags:
            ans.append('Unknown4: %r'%self.unknown4)
            ans.append('First content record: %d'% self.first_content_record)
            ans.append('Last content record: %d'% self.last_content_record)
            ans.append('Unknown5: %d'% self.unknown5)
            ans.append('FCIS number: %d'% self.fcis_number)
            ans.append('FCIS count: %d'% self.fcis_count)
            ans.append('FLIS number: %d'% self.flis_number)
            ans.append('FLIS count: %d'% self.flis_count)
            ans.append('Unknown6: %r'% self.unknown6)
            ans.append(('Extra data flags: %s (has multibyte: %s) '
                '(has indexing: %s) (has uncrossable breaks: %s)')%(
                    bin(self.extra_data_flags), self.has_multibytes,
                    self.has_indexing_bytes, self.has_uncrossable_breaks))
            ans.append('Primary index record (null value: %d): %d'%(
                0xffffffff, self.primary_index_record))

        ans = '\n'.join(ans)

        if self.has_exth:
            ans += '\n\n' + str(self.exth)
            ans += '\n\nBytes after EXTH: %d'%self.bytes_after_exth

        ans += '\nNumber of bytes after full name: %d' % (len(self.raw) -
                (self.fullname_offset + self.fullname_length))

        ans += '\nRecord 0 length: %d'%len(self.raw)
        return ans
# }}}
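
# Each TAGX entry is four bytes: tag number, number of values per tag, a
# bitmask, and an end-of-field flag. When decoding an index entry, a tag is
# expected to be present iff (entry_type & bitmask) is non-zero; see
# IndexEntry.__init__() below, which filters the TAGX table exactly that way.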
class TagX(object): # {{{

    def __init__(self, raw, control_byte_count):
        self.tag = ord(raw[0])
        self.num_values = ord(raw[1])
        self.bitmask = ord(raw[2])
        # End of file = 1 iff last entry
        # When it is 1 all others are 0
        self.eof = ord(raw[3])

        self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
                and self.bitmask == 0)

    def __repr__(self):
        return 'TAGX(tag=%02d, num_values=%d, bitmask=%r, eof=%d)' % (self.tag,
                self.num_values, bin(self.bitmask), self.eof)
# }}}
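
# The primary index record starts with an INDX header that locates the TAGX
# tag table, the IDXT offset table and the CNCX string records that follow.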
class IndexHeader(object): # {{{

    def __init__(self, record):
        self.record = record
        raw = self.record.raw
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Primary Index Record')

        self.header_length, = struct.unpack('>I', raw[4:8])
        self.unknown1 = raw[8:16]
        self.index_type, = struct.unpack('>I', raw[16:20])
        self.index_type_desc = {0: 'normal', 2:
                'inflection'}.get(self.index_type, 'unknown')
        self.idxt_start, = struct.unpack('>I', raw[20:24])
        self.index_count, = struct.unpack('>I', raw[24:28])
        self.index_encoding_num, = struct.unpack('>I', raw[28:32])
        self.index_encoding = {65001: 'utf-8', 1252:
                'cp1252'}.get(self.index_encoding_num, 'unknown')
        if self.index_encoding == 'unknown':
            raise ValueError(
                'Unknown index encoding: %d'%self.index_encoding_num)
        self.locale_raw, = struct.unpack(b'>I', raw[32:36])
        langcode = self.locale_raw
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
        self.num_index_entries, = struct.unpack('>I', raw[36:40])
        self.ordt_start, = struct.unpack('>I', raw[40:44])
        self.ligt_start, = struct.unpack('>I', raw[44:48])
        self.num_of_ligt_entries, = struct.unpack('>I', raw[48:52])
        self.num_of_cncx_blocks, = struct.unpack('>I', raw[52:56])
        self.unknown2 = raw[56:180]
        self.tagx_offset, = struct.unpack(b'>I', raw[180:184])
        if self.tagx_offset != self.header_length:
            raise ValueError('TAGX offset and header length disagree')
        self.unknown3 = raw[184:self.header_length]

        tagx = raw[self.header_length:]
        if not tagx.startswith(b'TAGX'):
            raise ValueError('Invalid TAGX section')
        self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
        self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
        tag_table = tagx[12:self.tagx_header_length]
        if len(tag_table) % 4 != 0:
            raise ValueError('Invalid Tag table')
        num_tagx_entries = len(tag_table) // 4
        self.tagx_entries = []
        for i in range(num_tagx_entries):
            self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4],
                self.tagx_control_byte_count))
        if self.tagx_entries and not self.tagx_entries[-1].is_eof:
            raise ValueError('TAGX last entry is not EOF')
        self.tagx_entries = self.tagx_entries[:-1]

        idxt0_pos = self.header_length+self.tagx_header_length
        last_num, consumed = decode_hex_number(raw[idxt0_pos:])
        count_pos = idxt0_pos + consumed
        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])

        if last_num != self.ncx_count - 1:
            raise ValueError('Last id number in the NCX != NCX count - 1')
        # There may be some alignment zero bytes between the end of the idxt0
        # and self.idxt_start

        idxt = raw[self.idxt_start:]
        if idxt[:4] != b'IDXT':
            raise ValueError('Invalid IDXT header')
        length_check, = struct.unpack(b'>H', idxt[4:6])
        if length_check != self.header_length + self.tagx_header_length:
            raise ValueError('Length check failed')

    def __str__(self):
        ans = ['*'*20 + ' Index Header '+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))

        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
        a('Index encoding: %s (%d)'%(self.index_encoding,
            self.index_encoding_num))
        a('Index language: %s - %s (%s)'%(self.language, self.sublanguage,
            hex(self.locale_raw)))
        a('Number of index entries: %d'% self.num_index_entries)
        a('ORDT start: %d'%self.ordt_start)
        a('LIGT start: %d'%self.ligt_start)
        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
        a('Number of cncx blocks: %d'%self.num_of_cncx_blocks)
        u(self.unknown2)
        a('TAGX offset: %d'%self.tagx_offset)
        u(self.unknown3)
        a('\n\n')
        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
        a('Header length: %d'%self.tagx_header_length)
        a('Control byte count: %d'%self.tagx_control_byte_count)
        for i in self.tagx_entries:
            a('\t' + repr(i))
        a('Number of entries in the NCX: %d'% self.ncx_count)

        return '\n'.join(ans)
# }}}

class Tag(object): # {{{

    '''
    Index entries are a collection of tags. Each tag is represented by this
    class.
    '''

    TAG_MAP = {
            1: ('offset', 'Offset in HTML'),
            2: ('size', 'Size in HTML'),
            3: ('label_offset', 'Offset to label in CNCX'),
            4: ('depth', 'Depth of this entry in TOC'),

            # The remaining tag types have to be interpreted subject to the
            # type of index entry they are present in
    }

    INTERPRET_MAP = {
            'subchapter': {
                5  : ('Parent chapter index', 'parent_index'),
            },

            'article' : {
                5  : ('Class offset in cncx', 'class_offset'),
                21 : ('Parent section index', 'parent_index'),
                22 : ('Description offset in cncx', 'desc_offset'),
                23 : ('Author offset in cncx', 'author_offset'),
            },

            'chapter_with_subchapters' : {
                22 : ('First subchapter index', 'first_child_index'),
                23 : ('Last subchapter index', 'last_child_index'),
            },

            'periodical' : {
                5  : ('Class offset in cncx', 'class_offset'),
                22 : ('First section index', 'first_child_index'),
                23 : ('Last section index', 'last_child_index'),
            },

            'section' : {
                5  : ('Class offset in cncx', 'class_offset'),
                21 : ('Periodical index', 'parent_index'),
                22 : ('First article index', 'first_child_index'),
                23 : ('Last article index', 'last_child_index'),
            },
    }

    def __init__(self, tagx, vals, entry_type, cncx):
        self.value = vals if len(vals) > 1 else vals[0]
        self.entry_type = entry_type
        self.cncx_value = None
        if tagx.tag in self.TAG_MAP:
            self.attr, self.desc = self.TAG_MAP[tagx.tag]
        else:
            try:
                td = self.INTERPRET_MAP[entry_type]
            except KeyError:
                raise ValueError('Unknown entry type: %s'%entry_type)
            try:
                self.desc, self.attr = td[tagx.tag]
            except KeyError:
                raise ValueError('Unknown tag: %d for entry type: %s'%(
                    tagx.tag, entry_type))
        if '_offset' in self.attr:
            self.cncx_value = cncx[self.value]

    def __str__(self):
        if self.cncx_value is not None:
            return '%s : %r [%r]'%(self.desc, self.value, self.cncx_value)
        return '%s : %r'%(self.desc, self.value)
# }}}

class IndexEntry(object): # {{{

    '''
    The index is made up of entries, each of which is represented by an
    instance of this class. Index entries typically point to offsets in the
    HTML, specify HTML sizes and point to text strings in the CNCX that are
    used in the navigation UI.
    '''

    TYPES = {
            # Present in book type files
            0x0f : 'chapter',
            0x6f : 'chapter_with_subchapters',
            0x1f : 'subchapter',
            # Present in periodicals
            0xdf : 'periodical',
            0xff : 'section',
            0x3f : 'article',
    }

    def __init__(self, ident, entry_type, raw, cncx, tagx_entries):
        self.index = ident
        self.raw = raw
        self.tags = []

        try:
            self.entry_type = self.TYPES[entry_type]
        except KeyError:
            raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))

        expected_tags = [tag for tag in tagx_entries if tag.bitmask &
                entry_type]

        for tag in expected_tags:
            vals = []
            for i in range(tag.num_values):
                if not raw:
                    raise ValueError('Index entry does not match TAGX header')
                val, consumed = decint(raw)
                raw = raw[consumed:]
                vals.append(val)
            self.tags.append(Tag(tag, vals, self.entry_type, cncx))

    @property
    def label(self):
        for tag in self.tags:
            if tag.attr == 'label_offset':
                return tag.cncx_value
        return ''

    @property
    def offset(self):
        for tag in self.tags:
            if tag.attr == 'offset':
                return tag.value
        return 0

    @property
    def size(self):
        for tag in self.tags:
            if tag.attr == 'size':
                return tag.value
        return 0

    @property
    def depth(self):
        for tag in self.tags:
            if tag.attr == 'depth':
                return tag.value
        return 0

    @property
    def parent_index(self):
        for tag in self.tags:
            if tag.attr == 'parent_index':
                return tag.value
        return -1

    @property
    def first_child_index(self):
        for tag in self.tags:
            if tag.attr == 'first_child_index':
                return tag.value
        return -1

    @property
    def last_child_index(self):
        for tag in self.tags:
            if tag.attr == 'last_child_index':
                return tag.value
        return -1

    def __str__(self):
        ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
            self.index, self.entry_type, len(self.tags))]
        for tag in self.tags:
            ans.append('\t'+str(tag))
        if self.first_child_index != -1:
            ans.append('\tNumber of children: %d'%(self.last_child_index -
                self.first_child_index + 1))
        return '\n'.join(ans)
# }}}

class IndexRecord(object): # {{{

    '''
    Represents all indexing information in the MOBI, apart from indexing info
    in the trailing data of the text records.
    '''

    def __init__(self, record, index_header, cncx):
        self.record = record
        raw = self.record.raw
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Primary Index Record')

        u = struct.unpack

        self.header_length, = u('>I', raw[4:8])
        self.unknown1 = raw[8:12]
        self.header_type, = u('>I', raw[12:16])
        self.unknown2 = raw[16:20]
        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
        if self.idxt_offset < 192:
            raise ValueError('Unknown Index record structure')
        self.unknown3 = raw[28:36]
        self.unknown4 = raw[36:192] # Should be 156 bytes

        self.index_offsets = []
        indices = raw[self.idxt_offset:]
        if indices[:4] != b'IDXT':
            raise ValueError("Invalid IDXT index table")
        indices = indices[4:]
        for i in range(self.idxt_count):
            off, = u(b'>H', indices[i*2:(i+1)*2])
            self.index_offsets.append(off-192)

        indxt = raw[192:self.idxt_offset]
        self.indices = []
        for i, off in enumerate(self.index_offsets):
            try:
                next_off = self.index_offsets[i+1]
            except IndexError:
                next_off = len(indxt)
            index, consumed = decode_hex_number(indxt[off:])
            entry_type = ord(indxt[off+consumed])
            self.indices.append(IndexEntry(index, entry_type,
                indxt[off+consumed+1:next_off], cncx,
                index_header.tagx_entries))

    def get_parent(self, index):
        if index.depth < 1:
            return None
        parent_depth = index.depth - 1
        # The parent is taken to be the last entry at the parent depth that
        # starts at or before this entry (an assumption; the original left
        # this loop body unfinished).
        ans = None
        for p in self.indices:
            if p.depth != parent_depth:
                continue
            if p.offset <= index.offset:
                ans = p
        return ans

    def __str__(self):
        ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
        a = ans.append
        def u(w):
            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
                len(w), not bool(w.replace(b'\0', b'')) ))
        a('Header length: %d'%self.header_length)
        u(self.unknown1)
        a('Header Type: %d'%self.header_type)
        u(self.unknown2)
        a('IDXT Offset: %d'%self.idxt_offset)
        a('IDXT Count: %d'%self.idxt_count)
        u(self.unknown3)
        u(self.unknown4)
        a('Index offsets: %r'%self.index_offsets)
        a('\nIndex Entries:')
        for entry in self.indices:
            a(str(entry)+'\n')

        return '\n'.join(ans)
# }}}
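
# Each CNCX string is stored as a variable-width integer length followed by
# that many bytes of encoded text; the key into the mapping is the string's
# byte offset, which is what the *_offset index tags store.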
class CNCX(object): # {{{

    '''
    Parses the records that contain the compiled NCX (all strings from the
    NCX). Presents a simple offset : string mapping interface to access the
    data.
    '''

    def __init__(self, records, codec):
        self.records = OrderedDict()
        pos = 0
        for record in records:
            raw = record.raw
            while pos < len(raw):
                length, consumed = decint(raw[pos:])
                if length > 0:
                    self.records[pos] = raw[
                            pos+consumed:pos+consumed+length].decode(codec)
                pos += consumed+length

    def __getitem__(self, offset):
        return self.records.get(offset)

    def __str__(self):
        ans = ['*'*20 + ' cncx (%d strings) '%len(self.records)+ '*'*20]
        for k, v in self.records.iteritems():
            ans.append('%10d : %s'%(k, v))
        return '\n'.join(ans)
# }}}
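
# The low three bits of extra_data_flags in the MOBI header say which
# trailing data entries each text record carries: bit 0 the multibyte
# character overlap, bit 1 the TBS indexing bytes, bit 2 the uncrossable
# break markers. TextRecord renames those numeric keys accordingly.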
class TextRecord(object): # {{{

    def __init__(self, idx, record, extra_data_flags, decompress):
        self.trailing_data, self.raw = get_trailing_data(record.raw,
                extra_data_flags)
        self.raw = decompress(self.raw)
        if 0 in self.trailing_data:
            self.trailing_data['multibyte_overlap'] = self.trailing_data.pop(0)
        if 1 in self.trailing_data:
            self.trailing_data['indexing'] = self.trailing_data.pop(1)
        if 2 in self.trailing_data:
            self.trailing_data['uncrossable_breaks'] = self.trailing_data.pop(2)

        self.idx = idx

    def dump(self, folder):
        name = '%06d'%self.idx
        with open(os.path.join(folder, name+'.txt'), 'wb') as f:
            f.write(self.raw)
        with open(os.path.join(folder, name+'.trailing_data'), 'wb') as f:
            for k, v in self.trailing_data.iteritems():
                raw = '%s : %r\n\n'%(k, v)
                f.write(raw.encode('utf-8'))
# }}}

class ImageRecord(object): # {{{

    def __init__(self, idx, record, fmt):
        self.raw = record.raw
        self.fmt = fmt
        self.idx = idx

    def dump(self, folder):
        name = '%06d'%self.idx
        with open(os.path.join(folder, name+'.'+self.fmt), 'wb') as f:
            f.write(self.raw)
# }}}

class BinaryRecord(object): # {{{

    def __init__(self, idx, record):
        self.raw = record.raw
        sig = self.raw[:4]
        name = '%06d'%idx
        if sig in (b'FCIS', b'FLIS', b'SRCS'):
            name += '-' + sig.decode('ascii')
        elif sig == b'\xe9\x8e\r\n':
            name += '-' + 'EOF'
        self.name = name

    def dump(self, folder):
        with open(os.path.join(folder, self.name+'.bin'), 'wb') as f:
            f.write(self.raw)
# }}}
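
# TBS (trailing byte sequence) indexing: each text record of a periodical
# carries a few trailing bytes that describe which index entries start, end
# or span that record, apparently so a reader can recover its position
# without parsing the whole index. TBSIndexing recomputes that geometry from
# the index entries and then decodes the raw bytes for comparison.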
class TBSIndexing(object): # {{{

    def __init__(self, text_records, indices, doc_type):
        self.record_indices = OrderedDict()
        self.doc_type = doc_type
        self.indices = indices
        pos = 0
        for r in text_records:
            start = pos
            pos += len(r.raw)
            end = pos - 1
            self.record_indices[r] = x = {'starts':[], 'ends':[],
                    'complete':[], 'geom': (start, end)}
            for entry in indices:
                istart, sz = entry.offset, entry.size
                iend = istart + sz - 1
                has_start = istart >= start and istart <= end
                has_end = iend >= start and iend <= end
                rec = None
                if has_start and has_end:
                    rec = 'complete'
                elif has_start and not has_end:
                    rec = 'starts'
                elif not has_start and has_end:
                    rec = 'ends'
                if rec:
                    x[rec].append(entry)

    def get_index(self, idx):
        for i in self.indices:
            if i.index == idx:
                return i
        raise IndexError('Index %d not found'%idx)

    def __str__(self):
        ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20]
        for r, dat in self.record_indices.iteritems():
            ans += self.dump_record(r, dat)[-1]
        return '\n'.join(ans)

    def dump(self, bdir):
        types = defaultdict(list)
        for r, dat in self.record_indices.iteritems():
            tbs_type, strings = self.dump_record(r, dat)
            if tbs_type == 0:
                continue
            types[tbs_type] += strings
        for typ, strings in types.iteritems():
            with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f:
                f.write('\n'.join(strings))

    def dump_record(self, r, dat):
        ans = []
        ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx,
            dat['geom'][0], dat['geom'][1]))
        s, e, c = dat['starts'], dat['ends'], dat['complete']
        ans.append(('\tContains: %d index entries '
            '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e,
                c, s))))
        byts = bytearray(r.trailing_data.get('indexing', b''))
        sbyts = tuple(hex(b)[2:] for b in byts)
        ans.append('TBS bytes: %s'%(' '.join(sbyts)))
        for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
            if entries:
                ans.append('\t%s:'%typ)
                for x in entries:
                    ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
                        'Depth: %d, Offset: %d, Size: %d) [%s]')%(
                        x.index, x.parent_index, x.depth, x.offset, x.size,
                        x.label))
        def bin3(num):
            ans = bin(num)[2:]
            return '0'*(3-len(ans)) + ans

        tbs_type = 0
        if len(byts):
            outer, consumed = decint(byts)
            byts = byts[consumed:]
            tbs_type = outer & 0b111
            ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
            ans.append('Outer Index entry: %d'%(outer >> 3))
            arg1, consumed = decint(byts)
            byts = byts[consumed:]
            ans.append('Unknown (vwi: always 0?): %d'%arg1)
            if self.doc_type in (257, 259): # Hierarchical periodical
                byts, a = self.interpret_periodical(tbs_type, byts)
                ans += a
            if byts:
                sbyts = tuple(hex(b)[2:] for b in byts)
                ans.append('Remaining bytes: %s'%' '.join(sbyts))

        ans.append('')
        return tbs_type, ans

    def interpret_periodical(self, tbs_type, byts):
        ans = []

        def tbs_type_6(byts, psi=None): # {{{
            if psi is None:
                # Assume parent section is 1
                psi = self.get_index(1)
            if byts:
                # byts could be empty
                arg, consumed = decint(byts)
                byts = byts[consumed:]
                flags = (arg & 0b1111)
                ai = (arg >> 4)
                ans.append(('Article index at start of record or first article'
                    ' index, relative to parent section (fvwi): %d'
                    ' [%d absolute]'%(ai, ai+psi.index)))
                if flags == 1:
                    arg, consumed = decint(byts)
                    byts = byts[consumed:]
                    ans.append('EOF (vwi: should be 0): %d'%arg)
                elif flags == 4:
                    num = byts[0]
                    byts = byts[1:]
                    ans.append('Number of article nodes in the record '
                            '(byte): %d'%num)
                elif flags == 0:
                    pass
                else:
                    raise ValueError('Unknown flags: %d'%flags)
            return byts
        # }}}

        if tbs_type == 3: # {{{
            if byts:
                arg2, consumed = decint(byts)
                byts = byts[consumed:]
                ans.append('Unknown: %d'%arg2)
            if byts:
                arg3, consumed = decint(byts)
                byts = byts[consumed:]
                fsi = arg3 >> 4
                extra = arg3 & 0b1111
                ans.append('First section index: %d'%fsi)
                psi = self.get_index(fsi)
                ans.append('Extra bits: %d'%extra)
                if byts:
                    if byts[0] == fsi:
                        ssi = psi.index+1
                        ans.append('First section ends')
                        byts = byts[1:]
                        arg, consumed = decint(byts)
                        raw = byts[:consumed]
                        byts = byts[consumed:]
                        flags = arg & 0b1111
                        ans.append('Unknown (art index at start of record?):'
                                ' %d %r'%((arg>>4), raw))
                        ans.append('Flags: %d'%flags)
                        num = 1
                        if flags >= 4:
                            num = byts[0]
                            byts = byts[1:]
                        ans.append('Number of articles in closing section:'
                                ' %d'%num)
                        if flags == 5:
                            arg, consumed = decint(byts)
                            ans.append('Unknown: %r'%bytes(byts[:consumed]))
                            byts = byts[consumed:]
                        arg, consumed = decint(byts)
                        byts = byts[consumed:]
                        off = arg >> 4
                        ans.append('Last article of ending section w.r.t.'
                                ' starting section offset: %d'
                                ' [%d absolute]'%(off, ssi+off))
                        ans.append('Extra bits: %d'%(arg & 0b1111))
                        arg, consumed = decint(byts)
                        byts = byts[consumed:]
                        off = arg >> 4
                        flag = arg & 0b1111
                        ans.append('Offset to first article of starting'
                                ' section: %d [%d absolute]'%(off, ssi+off))
                        ans.append('Flags: %d'%flag)
                        num = 1
                        if flag == 4:
                            num = byts[0]
                            byts = byts[1:]
                        ans.append('Number of articles in starting section:'
                                ' %d'%num)
                    else:
                        ans.append('First section starts')
                        off, consumed = decint(byts)
                        flags = off & 0b1111
                        off = off >> 4
                        byts = byts[consumed:]
                        ans.append('Article at start of block as offset from '
                                'parent index: %d [%d absolute]'%(off,
                                    psi.index+off))
                        ans.append('Flags: %d'%flags)
                        if flags == 4:
                            ans.append('Number of articles: %d'%byts[0])
                            byts = byts[1:]
        # }}}

        elif tbs_type == 7: # {{{
            # This occurs for records that have no section nodes and
            # whose parent section's index == 1
            ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
            byts = byts[2:]
            arg, consumed = decint(byts)
            byts = byts[consumed:]
            ai = arg >> 4
            flags = arg & 0b1111
            ans.append('Article at start of record (fvwi): %d'%ai)
            if flags == 4:
                num = byts[0]
                byts = byts[1:]
                ans.append('Number of articles in record (byte): %d'%num)
            elif flags == 0:
                pass
            elif flags == 1:
                arg, consumed = decint(byts)
                byts = byts[consumed:]
                ans.append('EOF (vwi: should be 0): %d'%arg)
            else:
                raise ValueError('Unknown flags value: %d'%flags)
        # }}}

        elif tbs_type == 6: # {{{
            # This is used for records spanned by an article whose parent
            # section's index == 1 or for the opening record if it contains
            # the periodical start, section 1 start and at least one article.
            # The two cases are distinguished by the flags on the article
            # index vwi.
            unk = byts[0]
            byts = byts[1:]
            ans.append('Unknown (byte: always 2?): %d'%unk)
            byts = tbs_type_6(byts)
        # }}}

        elif tbs_type == 2: # {{{
            # This occurs for records with no section nodes and whose parent
            # section's index != 1 (undefined (records before the first
            # section) or > 1).
            # This is also used for records that are spanned by an article
            # whose parent section index > 1. In this case the flags of the
            # vwi referring to the article at the start of the record are
            # set to 1 instead of 4.
            arg, consumed = decint(byts)
            byts = byts[consumed:]
            flags = (arg & 0b1111)
            psi = (arg >> 4)
            ans.append('Parent section index (fvwi): %d'%psi)
            psi = self.get_index(psi)
            ans.append('Flags: %d'%flags)
            if flags == 1:
                arg, consumed = decint(byts)
                byts = byts[consumed:]
                ans.append('Unknown (vwi?: always 0?): %d'%arg)
                byts = tbs_type_6(byts, psi=psi)
            elif flags == 0:
                byts = tbs_type_6(byts, psi=psi)
            else:
                raise ValueError('Unknown flags: %d'%flags)
        # }}}

        return byts, ans
# }}}
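
# Ties everything together: reads the PDB container, parses record 0, picks
# a decompressor based on the declared compression, then classifies the
# remaining records as text, index, image or binary.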
class MOBIFile(object): # {{{

    def __init__(self, stream):
        self.raw = stream.read()

        self.palmdb = PalmDB(self.raw[:78])

        self.record_headers = []
        self.records = []
        for i in xrange(self.palmdb.number_of_records):
            pos = 78 + i * 8
            offset, a1, a2, a3, a4 = struct.unpack(b'>LBBBB',
                    self.raw[pos:pos+8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            self.record_headers.append((offset, flags, val))

        def section(section_number):
            if section_number == self.palmdb.number_of_records - 1:
                end_off = len(self.raw)
            else:
                end_off = self.record_headers[section_number + 1][0]
            off = self.record_headers[section_number][0]
            return self.raw[off:end_off]

        for i in range(self.palmdb.number_of_records):
            self.records.append(Record(section(i), self.record_headers[i]))

        self.mobi_header = MOBIHeader(self.records[0])

        if 'huff' in self.mobi_header.compression.lower():
            # Index into self.records to collect the HUFF/CDIC records
            huffrecs = [self.records[i].raw for i in
                    xrange(self.mobi_header.huffman_record_offset,
                        self.mobi_header.huffman_record_offset +
                        self.mobi_header.huffman_record_count)]
            from calibre.ebooks.mobi.huffcdic import HuffReader
            huffs = HuffReader(huffrecs)
            decompress = huffs.decompress
        elif 'palmdoc' in self.mobi_header.compression.lower():
            from calibre.ebooks.compression.palmdoc import decompress_doc
            decompress = decompress_doc
        else:
            decompress = lambda x: x

        self.index_header = self.index_record = None
        self.indexing_record_nums = set()
        pir = self.mobi_header.primary_index_record
        if pir != 0xffffffff:
            self.index_header = IndexHeader(self.records[pir])
            self.cncx = CNCX(self.records[
                pir+2:pir+2+self.index_header.num_of_cncx_blocks],
                self.index_header.index_encoding)
            self.index_record = IndexRecord(self.records[pir+1],
                    self.index_header, self.cncx)
            self.indexing_record_nums = set(xrange(pir,
                pir+2+self.index_header.num_of_cncx_blocks))

        ntr = self.mobi_header.number_of_text_records
        fntbr = self.mobi_header.first_non_book_record
        fii = self.mobi_header.first_image_index
        if fntbr == 0xffffffff:
            fntbr = len(self.records)
        self.text_records = [TextRecord(r, self.records[r],
            self.mobi_header.extra_data_flags, decompress) for r in xrange(1,
                min(len(self.records), ntr+1))]
        self.image_records, self.binary_records = [], []
        for i in xrange(fntbr, len(self.records)):
            if i in self.indexing_record_nums:
                continue
            r = self.records[i]
            fmt = None
            if i >= fii and r.raw[:4] not in (b'FLIS', b'FCIS', b'SRCS',
                    b'\xe9\x8e\r\n'):
                try:
                    width, height, fmt = identify_data(r.raw)
                except:
                    pass
            if fmt is not None:
                self.image_records.append(ImageRecord(i, r, fmt))
            else:
                self.binary_records.append(BinaryRecord(i, r))

        if self.index_record is not None:
            self.tbs_indexing = TBSIndexing(self.text_records,
                    self.index_record.indices, self.mobi_header.type_raw)

    def print_header(self, f=sys.stdout):
        print(str(self.palmdb).encode('utf-8'), file=f)
        print(file=f)
        print('Record headers:', file=f)
        for i, r in enumerate(self.records):
            print('%6d. %s'%(i, r.header), file=f)

        print(file=f)
        print(str(self.mobi_header).encode('utf-8'), file=f)
# }}}

def inspect_mobi(path_or_stream, prefix='decompiled'):
    stream = (path_or_stream if hasattr(path_or_stream, 'read') else
            open(path_or_stream, 'rb'))
    f = MOBIFile(stream)
    ddir = prefix + '_' + os.path.splitext(os.path.basename(stream.name))[0]
    try:
        shutil.rmtree(ddir)
    except:
        pass
    os.mkdir(ddir)
    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
        f.print_header(f=out)
    if f.index_header is not None:
        with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
            print(str(f.index_header), file=out)
            print('\n\n', file=out)
            print(str(f.cncx).encode('utf-8'), file=out)
            print('\n\n', file=out)
            print(str(f.index_record), file=out)
        with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
            print(str(f.tbs_indexing), file=out)
        f.tbs_indexing.dump(ddir)

    for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
            ('binary', 'binary_records')]:
        tdir = os.path.join(ddir, tdir)
        os.mkdir(tdir)
        for rec in getattr(f, attr):
            rec.dump(tdir)

    print('Debug data saved to:', ddir)
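
# Example usage (assuming this module is importable as
# calibre.ebooks.mobi.debug; book.mobi is a hypothetical path):
#
#   from calibre.ebooks.mobi.debug import inspect_mobi
#   inspect_mobi('book.mobi')  # writes decompiled_book/ containing
#                              # header.txt, index.txt, tbs_indexing.txt and
#                              # the text/, images/ and binary/ record dumps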
def main():
    inspect_mobi(sys.argv[1])

if __name__ == '__main__':
    main()