Refactor inspect MOBI to use the INDX reading code from mobi.reader

This commit is contained in:
Kovid Goyal 2012-03-17 15:31:05 +05:30
parent 91a4bd7d42
commit c87ad6d69f
5 changed files with 193 additions and 337 deletions

View File

@ -15,6 +15,8 @@ from lxml import html
from calibre.utils.date import utc_tz from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (parse_index_record,
parse_tagx_section)
from calibre.ebooks.mobi.utils import (decode_hex_number, decint, from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
get_trailing_data, decode_tbs, read_font_record) get_trailing_data, decode_tbs, read_font_record)
from calibre.utils.magick.draw import identify_data from calibre.utils.magick.draw import identify_data
@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{
class TagX(object): # {{{ class TagX(object): # {{{
def __init__(self, raw): def __init__(self, tag, num_values, bitmask, eof):
self.tag = ord(raw[0]) self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values,
self.num_values = ord(raw[1]) bitmask, eof)
self.bitmask = ord(raw[2]) self.num_of_values = num_values
# End of file = 1 iff last entry
# When it is 1 all others are 0
self.eof = ord(raw[3])
self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
and self.bitmask == 0) and self.bitmask == 0)
@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{
raise ValueError('Invalid TAGX section') raise ValueError('Invalid TAGX section')
self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
tag_table = tagx[12:self.tagx_header_length] self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
if len(tag_table) % 4 != 0:
raise ValueError('Invalid Tag table')
num_tagx_entries = len(tag_table) // 4
self.tagx_entries = []
for i in range(num_tagx_entries):
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
if self.tagx_entries and not self.tagx_entries[-1].is_eof: if self.tagx_entries and not self.tagx_entries[-1].is_eof:
raise ValueError('TAGX last entry is not EOF') raise ValueError('TAGX last entry is not EOF')
@ -533,7 +525,8 @@ class IndexHeader(object): # {{{
raise ValueError('Invalid Primary Index Record') raise ValueError('Invalid Primary Index Record')
self.header_length, = struct.unpack('>I', raw[4:8]) self.header_length, = struct.unpack('>I', raw[4:8])
self.unknown1 = raw[8:16] self.unknown1 = raw[8:12]
self.header_type, = struct.unpack('>I', raw[12:16])
self.index_type, = struct.unpack('>I', raw[16:20]) self.index_type, = struct.unpack('>I', raw[16:20])
self.index_type_desc = {0: 'normal', 2: self.index_type_desc = {0: 'normal', 2:
'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
@ -562,13 +555,7 @@ class IndexHeader(object): # {{{
raise ValueError('Invalid TAGX section') raise ValueError('Invalid TAGX section')
self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
tag_table = tagx[12:self.tagx_header_length] self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
if len(tag_table) % 4 != 0:
raise ValueError('Invalid Tag table')
num_tagx_entries = len(tag_table) // 4
self.tagx_entries = []
for i in range(num_tagx_entries):
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
if self.tagx_entries and not self.tagx_entries[-1].is_eof: if self.tagx_entries and not self.tagx_entries[-1].is_eof:
raise ValueError('TAGX last entry is not EOF') raise ValueError('TAGX last entry is not EOF')
@ -602,6 +589,7 @@ class IndexHeader(object): # {{{
a('Header length: %d'%self.header_length) a('Header length: %d'%self.header_length)
u(self.unknown1) u(self.unknown1)
a('Header type: %d'%self.header_type)
a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
a('Offset to IDXT start: %d'%self.idxt_start) a('Offset to IDXT start: %d'%self.idxt_start)
a('Number of index records: %d'%self.index_count) a('Number of index records: %d'%self.index_count)
@ -661,19 +649,15 @@ class Tag(object): # {{{
} }
def __init__(self, tagx, vals, entry_type, cncx): def __init__(self, tag_type, vals, cncx):
self.value = vals if len(vals) > 1 else vals[0] if vals else None self.value = vals if len(vals) > 1 else vals[0] if vals else None
self.entry_type = entry_type
tag_type = tagx.tag
self.cncx_value = None self.cncx_value = None
if tag_type in self.TAG_MAP: if tag_type in self.TAG_MAP:
self.attr, self.desc = self.TAG_MAP[tag_type] self.attr, self.desc = self.TAG_MAP[tag_type]
else: else:
print ('Unknown tag value: %d in entry type: %s'%(tag_type, print ('Unknown tag value: %%s'%tag_type)
entry_type)) self.desc = '??Unknown (tag value: %d)'%tag_type
self.desc = '??Unknown (tag value: %d type: %s)'%(
tag_type, entry_type)
self.attr = 'unknown' self.attr = 'unknown'
if '_offset' in self.attr: if '_offset' in self.attr:
@ -695,50 +679,13 @@ class IndexEntry(object): # {{{
used in the navigation UI. used in the navigation UI.
''' '''
def __init__(self, ident, entry_type, raw, cncx, tagx_entries, def __init__(self, ident, entry, cncx):
control_byte_count): try:
self.index = int(ident, 16)
except ValueError:
self.index = ident self.index = ident
self.raw = raw self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in
self.tags = [] entry.iteritems()]
self.entry_type = entry_type
self.byte_size = len(raw)
orig_raw = raw
if control_byte_count not in (1, 2):
raise ValueError('Unknown control byte count: %d'%
control_byte_count)
self.flags = 0
if control_byte_count == 2:
self.flags = ord(raw[0])
raw = raw[1:]
expected_tags = [tag for tag in tagx_entries if tag.bitmask &
entry_type]
flags = self.flags
for tag in expected_tags:
vals = []
if tag.tag > 0b1000000: # 0b1000000 = 64
has_tag = flags & 0b1
flags = flags >> 1
if not has_tag: continue
for i in range(tag.num_values):
if not raw:
raise ValueError('Index entry does not match TAGX header')
val, consumed = decint(raw)
raw = raw[consumed:]
vals.append(val)
self.tags.append(Tag(tag, vals, self.entry_type, cncx))
self.consumed = len(orig_raw) - len(raw)
self.trailing_bytes = raw
if self.trailing_bytes.replace(b'\0', b''):
raise ValueError('%s has leftover bytes: %s'%(self, format_bytes(
self.trailing_bytes)))
@property @property
def label(self): def label(self):
@ -797,102 +744,14 @@ class IndexEntry(object): # {{{
return [0, 0] return [0, 0]
def __str__(self): def __str__(self):
ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, ' ans = ['Index Entry(index=%s, length=%d)'%(
'length=%d, byte_size=%d)'%( self.index, len(self.tags))]
self.index, bin(self.entry_type), bin(self.flags)[2:],
len(self.tags), self.byte_size)]
for tag in self.tags: for tag in self.tags:
if tag.value is not None: if tag.value is not None:
ans.append('\t'+str(tag)) ans.append('\t'+str(tag))
if self.first_child_index != -1: if self.first_child_index != -1:
ans.append('\tNumber of children: %d'%(self.last_child_index - ans.append('\tNumber of children: %d'%(self.last_child_index -
self.first_child_index + 1)) self.first_child_index + 1))
if self.trailing_bytes:
ans.append('\tTrailing bytes: %r'%self.trailing_bytes)
return '\n'.join(ans)
# }}}
class SecondaryIndexRecord(object): # {{{
def __init__(self, record, index_header, cncx):
self.record = record
raw = self.record.raw
if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record')
u = struct.unpack
self.header_length, = u('>I', raw[4:8])
self.unknown1 = raw[8:12]
self.header_type, = u('>I', raw[12:16])
self.unknown2 = raw[16:20]
self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
if self.idxt_offset < 192:
raise ValueError('Unknown Index record structure')
self.unknown3 = raw[28:36]
self.unknown4 = raw[36:192] # Should be 156 bytes
self.index_offsets = []
indices = raw[self.idxt_offset:]
if indices[:4] != b'IDXT':
raise ValueError("Invalid IDXT index table")
indices = indices[4:]
for i in range(self.idxt_count):
off, = u(b'>H', indices[i*2:(i+1)*2])
self.index_offsets.append(off-192)
rest = indices[(i+1)*2:]
if rest.replace(b'\0', ''): # There can be padding null bytes
raise ValueError('Extra bytes after IDXT table: %r'%rest)
indxt = raw[192:self.idxt_offset]
self.size_of_indxt_block = len(indxt)
self.indices = []
for i, off in enumerate(self.index_offsets):
try:
next_off = self.index_offsets[i+1]
except:
next_off = len(indxt)
num = ord(indxt[off])
index = indxt[off+1:off+1+num]
consumed = 1 + num
entry_type = ord(indxt[off+consumed])
pos = off+consumed+1
idxe = IndexEntry(index, entry_type,
indxt[pos:next_off], cncx,
index_header.tagx_entries,
index_header.tagx_control_byte_count)
self.indices.append(idxe)
rest = indxt[pos+self.indices[-1].consumed:]
if rest.replace(b'\0', b''): # There can be padding null bytes
raise ValueError('Extra bytes after IDXT table: %r'%rest)
def __str__(self):
ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
a = ans.append
def u(w):
a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
len(w), not bool(w.replace(b'\0', b'')) ))
a('Header length: %d'%self.header_length)
u(self.unknown1)
a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
u(self.unknown2)
a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
self.idxt_offset))
a('IDXT Count: %d'%self.idxt_count)
u(self.unknown3)
u(self.unknown4)
a('Index offsets: %r'%self.index_offsets)
a('\nIndex Entries (%d entries):'%len(self.indices))
for entry in self.indices:
a(str(entry))
a('')
return '\n'.join(ans) return '\n'.join(ans)
# }}} # }}}
@ -904,58 +763,25 @@ class IndexRecord(object): # {{{
in the trailing data of the text records. in the trailing data of the text records.
''' '''
def __init__(self, record, index_header, cncx): def __init__(self, records, index_header, cncx):
self.record = record
self.alltext = None self.alltext = None
raw = self.record.raw table = OrderedDict()
tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
index_header.tagx_entries]
for record in records:
raw = record.raw
if raw[:4] != b'INDX': if raw[:4] != b'INDX':
raise ValueError('Invalid Primary Index Record') raise ValueError('Invalid Primary Index Record')
u = struct.unpack parse_index_record(table, record.raw,
index_header.tagx_control_byte_count, tags,
index_header.index_encoding, strict=True)
self.header_length, = u('>I', raw[4:8])
self.unknown1 = raw[8:12]
self.header_type, = u('>I', raw[12:16])
self.unknown2 = raw[16:20]
self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
if self.idxt_offset < 192:
raise ValueError('Unknown Index record structure')
self.unknown3 = raw[28:36]
self.unknown4 = raw[36:192] # Should be 156 bytes
self.index_offsets = []
indices = raw[self.idxt_offset:]
if indices[:4] != b'IDXT':
raise ValueError("Invalid IDXT index table")
indices = indices[4:]
for i in range(self.idxt_count):
off, = u(b'>H', indices[i*2:(i+1)*2])
self.index_offsets.append(off-192)
rest = indices[(i+1)*2:]
if rest.replace(b'\0', ''): # There can be padding null bytes
raise ValueError('Extra bytes after IDXT table: %r'%rest)
indxt = raw[192:self.idxt_offset]
self.size_of_indxt_block = len(indxt)
self.indices = [] self.indices = []
for i, off in enumerate(self.index_offsets):
try:
next_off = self.index_offsets[i+1]
except:
next_off = len(indxt)
index, consumed = decode_hex_number(indxt[off:])
entry_type = ord(indxt[off+consumed])
pos = off+consumed+1
idxe = IndexEntry(index, entry_type,
indxt[pos:next_off], cncx,
index_header.tagx_entries,
index_header.tagx_control_byte_count)
self.indices.append(idxe)
rest = indxt[pos+self.indices[-1].consumed:] for ident, entry in table.iteritems():
if rest.replace(b'\0', b''): # There can be padding null bytes self.indices.append(IndexEntry(ident, entry, cncx))
raise ValueError('Extra bytes after IDXT table: %r'%rest)
def get_parent(self, index): def get_parent(self, index):
if index.depth < 1: if index.depth < 1:
@ -965,24 +791,12 @@ class IndexRecord(object): # {{{
if p.depth != parent_depth: if p.depth != parent_depth:
continue continue
def __str__(self): def __str__(self):
ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
a = ans.append a = ans.append
def u(w): def u(w):
a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
len(w), not bool(w.replace(b'\0', b'')) )) len(w), not bool(w.replace(b'\0', b'')) ))
a('Header length: %d'%self.header_length)
u(self.unknown1)
a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
u(self.unknown2)
a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
self.idxt_offset))
a('IDXT Count: %d'%self.idxt_count)
u(self.unknown3)
u(self.unknown4)
a('Index offsets: %r'%self.index_offsets)
a('\nIndex Entries (%d entries):'%len(self.indices))
for entry in self.indices: for entry in self.indices:
offset = entry.offset offset = entry.offset
a(str(entry)) a(str(entry))
@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{
def get_index(self, idx): def get_index(self, idx):
for i in self.indices: for i in self.indices:
if i.index == idx: return i if i.index in {idx, unicode(idx)}: return i
raise IndexError('Index %d not found'%idx) raise IndexError('Index %d not found'%idx)
def __str__(self): def __str__(self):
@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{
if entries: if entries:
ans.append('\t%s:'%typ) ans.append('\t%s:'%typ)
for x in entries: for x in entries:
ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
'Depth: %d, Offset: %d, Size: %d) [%s]')%( 'Depth: %d, Offset: %d, Size: %d) [%s]')%(
x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
def bin4(num): def bin4(num):
@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{
' when reading starting section'%extra) ' when reading starting section'%extra)
si = self.get_index(si) si = self.get_index(si)
ans.append('The section at the start of this record is:' ans.append('The section at the start of this record is:'
' %d'%si.index) ' %s'%si.index)
if 0b0100 in extra: if 0b0100 in extra:
num = extra[0b0100] num = extra[0b0100]
ans.append('The number of articles from the section %d' ans.append('The number of articles from the section %d'
' in this record: %d'%(si.index, num)) ' in this record: %s'%(si.index, num))
elif 0b0001 in extra: elif 0b0001 in extra:
eof = extra[0b0001] eof = extra[0b0001]
if eof != 0: if eof != 0:
raise ValueError('Unknown eof value %s when reading' raise ValueError('Unknown eof value %s when reading'
' starting section. All bytes: %r'%(eof, orig)) ' starting section. All bytes: %r'%(eof, orig))
ans.append('??This record has more than one article from ' ans.append('??This record has more than one article from '
' the section: %d'%si.index) ' the section: %s'%si.index)
return si, byts return si, byts
# }}} # }}}
@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{
pir = self.mobi_header.primary_index_record pir = self.mobi_header.primary_index_record
if pir != NULL_INDEX: if pir != NULL_INDEX:
self.index_header = IndexHeader(self.records[pir]) self.index_header = IndexHeader(self.records[pir])
numi = self.index_header.index_count
self.cncx = CNCX(self.records[ self.cncx = CNCX(self.records[
pir+2:pir+2+self.index_header.num_of_cncx_blocks], pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
self.index_header.index_encoding) self.index_header.index_encoding)
self.index_record = IndexRecord(self.records[pir+1], self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
self.index_header, self.cncx) self.index_header, self.cncx)
self.indexing_record_nums = set(xrange(pir, self.indexing_record_nums = set(xrange(pir,
pir+2+self.index_header.num_of_cncx_blocks)) pir+1+numi+self.index_header.num_of_cncx_blocks))
self.secondary_index_record = self.secondary_index_header = None self.secondary_index_record = self.secondary_index_header = None
sir = self.mobi_header.secondary_index_record sir = self.mobi_header.secondary_index_record
if sir != NULL_INDEX: if sir != NULL_INDEX:
self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
numi = self.secondary_index_header.index_count
self.indexing_record_nums.add(sir) self.indexing_record_nums.add(sir)
self.secondary_index_record = SecondaryIndexRecord( self.secondary_index_record = IndexRecord(
self.records[sir+1], self.secondary_index_header, self.cncx) self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
self.indexing_record_nums.add(sir+1) self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))
ntr = self.mobi_header.number_of_text_records ntr = self.mobi_header.number_of_text_records

View File

@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct import struct
from collections import OrderedDict from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.utils import decint, count_set_bits from calibre.ebooks.mobi.utils import (decint, count_set_bits,
decode_string)
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
class InvalidFile(ValueError): class InvalidFile(ValueError):
pass pass
@ -37,9 +41,8 @@ def parse_indx_header(data):
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
) )
num = len(words) num = len(words)
values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)]) values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
header = {words[i]:values[i] for i in xrange(num)} return dict(zip(words, values))
return header
class CNCX(object): # {{{ class CNCX(object): # {{{
@ -77,101 +80,94 @@ class CNCX(object): # {{{
return self.records.get(offset, default) return self.records.get(offset, default)
# }}} # }}}
def parse_tag_section(data): def parse_tagx_section(data):
check_signature(data, b'TAGX') check_signature(data, b'TAGX')
tags = [] tags = []
first_entry_offset, = struct.unpack_from(b'>L', data, 0x04) first_entry_offset, = struct.unpack_from(b'>L', data, 4)
control_byte_count, = struct.unpack_from(b'>L', data, 0x08) control_byte_count, = struct.unpack_from(b'>L', data, 8)
# Skip the first 12 bytes already read above.
for i in xrange(12, first_entry_offset, 4): for i in xrange(12, first_entry_offset, 4):
pos = i vals = list(bytearray(data[i:i+4]))
tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]), tags.append(TagX(*vals))
ord(data[pos+3])))
return control_byte_count, tags return control_byte_count, tags
def get_tag_map(control_byte_count, tags, data, start, end): def get_tag_map(control_byte_count, tagx, data, strict=False):
ptags = [] ptags = []
ans = {} ans = {}
control_byte_index = 0 control_bytes = list(bytearray(data[:control_byte_count]))
data_start = start + control_byte_count data = data[control_byte_count:]
for tag, values_per_entry, mask, end_flag in tags: for x in tagx:
if end_flag == 0x01: if x.eof == 0x01:
control_byte_index += 1 control_bytes = control_bytes[1:]
continue continue
value = ord(data[start + control_byte_index]) & mask value = control_bytes[0] & x.bitmask
if value != 0: if value != 0:
if value == mask: value_count = value_bytes = None
if count_set_bits(mask) > 1: if value == x.bitmask:
if count_set_bits(x.bitmask) > 1:
# If all bits of masked value are set and the mask has more # If all bits of masked value are set and the mask has more
# than one bit, a variable width value will follow after # than one bit, a variable width value will follow after
# the control bytes which defines the length of bytes (NOT # the control bytes which defines the length of bytes (NOT
# the value count!) which will contain the corresponding # the value count!) which will contain the corresponding
# variable width values. # variable width values.
value, consumed = decint(data[data_start:]) value_bytes, consumed = decint(data)
data_start += consumed data = data[consumed:]
ptags.append((tag, None, value, values_per_entry))
else: else:
ptags.append((tag, 1, None, values_per_entry)) value_count = 1
else: else:
# Shift bits to get the masked value. # Shift bits to get the masked value.
while mask & 0x01 == 0: mask = x.bitmask
mask = mask >> 1 while mask & 0b1 == 0:
value = value >> 1 mask >>= 1
ptags.append((tag, value, None, values_per_entry)) value >>= 1
for tag, value_count, value_bytes, values_per_entry in ptags: value_count = value
ptags.append(PTagX(x.tag, value_count, value_bytes,
x.num_of_values))
for x in ptags:
values = [] values = []
if value_count != None: if x.value_count is not None:
# Read value_count * values_per_entry variable width values. # Read value_count * values_per_entry variable width values.
for _ in xrange(value_count*values_per_entry): for _ in xrange(x.value_count * x.num_of_values):
byts, consumed = decint(data[data_start:]) byts, consumed = decint(data)
data_start += consumed data = data[consumed:]
values.append(byts) values.append(byts)
else: else: # value_bytes is not None
# Convert value_bytes to variable width values. # Convert value_bytes to variable width values.
total_consumed = 0 total_consumed = 0
while total_consumed < value_bytes: while total_consumed < x.value_bytes:
# Does this work for values_per_entry != 1? # Does this work for values_per_entry != 1?
byts, consumed = decint(data[data_start:]) byts, consumed = decint(data)
data_start += consumed data = data[consumed:]
total_consumed += consumed total_consumed += consumed
values.append(byts) values.append(byts)
if total_consumed != value_bytes: if total_consumed != x.value_bytes:
print ("Error: Should consume %s bytes, but consumed %s" % err = ("Error: Should consume %s bytes, but consumed %s" %
(value_bytes, total_consumed)) (x.value_bytes, total_consumed))
ans[tag] = values if strict:
# Test that all bytes have been processed if end is given. raise ValueError(err)
if end is not None and data_start < end: else:
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left. print(err)
rest = data[data_start:end] ans[x.tag] = values
if rest.replace(b'\0', b''): # Test that all bytes have been processed
print ("Warning: There are unprocessed index bytes left: %s" % if data.replace(b'\0', b''):
format_bytes(rest)) err = ("Warning: There are unprocessed index bytes left: %s" %
format_bytes(data))
if strict:
raise ValueError(err)
else:
print(err)
return ans return ans
def read_index(sections, idx, codec): def parse_index_record(table, data, control_byte_count, tags, codec,
table, cncx = OrderedDict(), CNCX([], codec) strict=False):
data = sections[idx][0]
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len']
control_byte_count, tags = parse_tag_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
data = sections[i][0]
header = parse_indx_header(data) header = parse_indx_header(data)
idxt_pos = header['start'] idxt_pos = header['start']
if data[idxt_pos:idxt_pos+4] != b'IDXT':
print ('WARNING: Invalid INDX record')
entry_count = header['count'] entry_count = header['count']
# loop through to build up the IDXT position starts # loop through to build up the IDXT position starts
@ -187,11 +183,32 @@ def read_index(sections, idx, codec):
# text # text
for j in xrange(entry_count): for j in xrange(entry_count):
start, end = idx_positions[j:j+2] start, end = idx_positions[j:j+2]
text_length = ord(data[start]) rec = data[start:end]
text = data[start+1:start+1+text_length] ident, consumed = decode_string(rec, codec=codec)
tag_map = get_tag_map(control_byte_count, tags, data, rec = rec[consumed:]
start+1+text_length, end) tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
table[text] = tag_map table[ident] = tag_map
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
data = sections[idx][0]
indx_header = parse_indx_header(data)
indx_count = indx_header['count']
if indx_header['ncncx'] > 0:
off = idx + indx_count + 1
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
# Index record
data = sections[i][0]
parse_index_record(table, data, control_byte_count, tags, codec)
return table, cncx return table, cncx

View File

@ -317,6 +317,7 @@ class Mobi8Reader(object):
for entry in index_entries: for entry in index_entries:
pos = entry['pos'] pos = entry['pos']
fi = self.get_file_info(pos) fi = self.get_file_info(pos)
#print (11111111, fi, entry['pos_fid'])
if fi.filename is None: if fi.filename is None:
raise ValueError('Index entry has invalid pos: %d'%pos) raise ValueError('Index entry has invalid pos: %d'%pos)
idtag = self.get_id_tag(pos).decode(self.header.codec) idtag = self.get_id_tag(pos).decode(self.header.codec)

View File

@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index from calibre.ebooks.mobi.reader.index import read_index
@ -23,7 +22,30 @@ tag_fieldname_map = {
6: ['pos_fid',0], 6: ['pos_fid',0],
21: ['parent',0], 21: ['parent',0],
22: ['child1',0], 22: ['child1',0],
23: ['childn',0] 23: ['childn',0],
69: ['image_index',0],
70 : ['desc_offset', 0], # 'Description offset in cncx'
71 : ['author_offset', 0], # 'Author offset in cncx'
72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
}
default_entry = {
'pos': -1,
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Class",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'description': None,
'author': None,
'image_caption': None,
'image_attribution': None,
} }
def read_ncx(sections, index, codec): def read_ncx(sections, index, codec):
@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):
for num, x in enumerate(table.iteritems()): for num, x in enumerate(table.iteritems()):
text, tag_map = x text, tag_map = x
entry = { entry = default_entry.copy()
'name': text, entry['name'] = text
'pos': -1, entry['num'] = num
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Kind",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'num' : num
}
for tag in tag_fieldname_map.keys(): for tag in tag_fieldname_map.iterkeys():
fieldname, i = tag_fieldname_map[tag] fieldname, i = tag_fieldname_map[tag]
if tag in tag_map: if tag in tag_map:
fieldvalue = tag_map[tag][i] fieldvalue = tag_map[tag][i]
if tag == 6: if tag == 6:
fieldvalue = to_base(fieldvalue, base=32) # Appears to be an idx into the KF8 elems table with an
# offset
fieldvalue = tuple(tag_map[tag])
entry[fieldname] = fieldvalue entry[fieldname] = fieldvalue
if tag == 3: for which, name in {3:'text', 5:'kind', 70:'description',
entry['text'] = cncx.get(fieldvalue, 'Unknown Text') 71:'author', 72:'image_caption',
if tag == 5: 73:'image_attribution'}.iteritems():
entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind') if tag == which:
entry[name] = cncx.get(fieldvalue,
default_entry[name])
index_entries.append(entry) index_entries.append(entry)
return index_entries return index_entries

View File

@ -15,7 +15,13 @@ from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024 IMAGE_MAX_SIZE = 10 * 1024 * 1024
def decode_hex_number(raw): def decode_string(raw, codec='utf-8'):
length, = struct.unpack(b'>B', raw[0])
raw = raw[1:1+length]
consumed = length+1
return raw.decode(codec), consumed
def decode_hex_number(raw, codec='utf-8'):
''' '''
Return a variable length number encoded using hexadecimal encoding. These Return a variable length number encoded using hexadecimal encoding. These
numbers have the first byte which tells the number of bytes that follow. numbers have the first byte which tells the number of bytes that follow.
@ -25,13 +31,16 @@ def decode_hex_number(raw):
:param raw: Raw binary data as a bytestring :param raw: Raw binary data as a bytestring
:return: The number and the number of bytes from raw that the number :return: The number and the number of bytes from raw that the number
occupies occupies.
''' '''
length, = struct.unpack(b'>B', raw[0]) raw, consumed = decode_string(raw, codec=codec)
raw = raw[1:1+length]
consumed = length+1
return int(raw, 16), consumed return int(raw, 16), consumed
def encode_string(raw):
    '''
    Encode a bytestring as a length prefixed string: a single byte holding
    the number of bytes in the string, followed by the string itself.

    :param raw: The data to encode. It is converted with ``bytes(raw)``
                before being measured and written.
    :return: A bytestring consisting of the one byte length prefix followed
             by the data.
    :raises ValueError: If the data is longer than 255 bytes, since the
                        length must fit in the single prefix byte.
    '''
    data = bytes(raw)
    size = len(data)
    # The prefix is exactly one byte. Check explicitly so the caller gets a
    # meaningful error instead of the generic out-of-range ValueError that
    # bytearray.insert() would raise for a value above 255.
    if size > 255:
        raise ValueError('Cannot encode a string of %d bytes: the length '
                'prefix is a single byte (maximum 255)' % size)
    ans = bytearray(data)
    ans.insert(0, size)
    return bytes(ans)
def encode_number_as_hex(num): def encode_number_as_hex(num):
''' '''
Encode num as a variable length encoded hexadecimal number. Returns the Encode num as a variable length encoded hexadecimal number. Returns the
@ -44,9 +53,7 @@ def encode_number_as_hex(num):
nlen = len(num) nlen = len(num)
if nlen % 2 != 0: if nlen % 2 != 0:
num = b'0'+num num = b'0'+num
ans = bytearray(num) return encode_string(num)
ans.insert(0, len(num))
return bytes(ans)
def encint(value, forward=True): def encint(value, forward=True):
''' '''