mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor inspect MOBI to use the INDX reading code from mobi.reader
This commit is contained in:
parent
91a4bd7d42
commit
c87ad6d69f
@ -15,6 +15,8 @@ from lxml import html
|
|||||||
from calibre.utils.date import utc_tz
|
from calibre.utils.date import utc_tz
|
||||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
from calibre.ebooks.mobi.langcodes import main_language, sub_language
|
||||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||||
|
from calibre.ebooks.mobi.reader.index import (parse_index_record,
|
||||||
|
parse_tagx_section)
|
||||||
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
|
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
|
||||||
get_trailing_data, decode_tbs, read_font_record)
|
get_trailing_data, decode_tbs, read_font_record)
|
||||||
from calibre.utils.magick.draw import identify_data
|
from calibre.utils.magick.draw import identify_data
|
||||||
@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{
|
|||||||
|
|
||||||
class TagX(object): # {{{
|
class TagX(object): # {{{
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, tag, num_values, bitmask, eof):
|
||||||
self.tag = ord(raw[0])
|
self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values,
|
||||||
self.num_values = ord(raw[1])
|
bitmask, eof)
|
||||||
self.bitmask = ord(raw[2])
|
self.num_of_values = num_values
|
||||||
# End of file = 1 iff last entry
|
|
||||||
# When it is 1 all others are 0
|
|
||||||
self.eof = ord(raw[3])
|
|
||||||
|
|
||||||
self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
|
self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0
|
||||||
and self.bitmask == 0)
|
and self.bitmask == 0)
|
||||||
|
|
||||||
@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{
|
|||||||
raise ValueError('Invalid TAGX section')
|
raise ValueError('Invalid TAGX section')
|
||||||
self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
|
self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
|
||||||
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
|
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
|
||||||
tag_table = tagx[12:self.tagx_header_length]
|
self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
|
||||||
if len(tag_table) % 4 != 0:
|
|
||||||
raise ValueError('Invalid Tag table')
|
|
||||||
num_tagx_entries = len(tag_table) // 4
|
|
||||||
self.tagx_entries = []
|
|
||||||
for i in range(num_tagx_entries):
|
|
||||||
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
|
|
||||||
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
|
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
|
||||||
raise ValueError('TAGX last entry is not EOF')
|
raise ValueError('TAGX last entry is not EOF')
|
||||||
|
|
||||||
@ -533,7 +525,8 @@ class IndexHeader(object): # {{{
|
|||||||
raise ValueError('Invalid Primary Index Record')
|
raise ValueError('Invalid Primary Index Record')
|
||||||
|
|
||||||
self.header_length, = struct.unpack('>I', raw[4:8])
|
self.header_length, = struct.unpack('>I', raw[4:8])
|
||||||
self.unknown1 = raw[8:16]
|
self.unknown1 = raw[8:12]
|
||||||
|
self.header_type, = struct.unpack('>I', raw[12:16])
|
||||||
self.index_type, = struct.unpack('>I', raw[16:20])
|
self.index_type, = struct.unpack('>I', raw[16:20])
|
||||||
self.index_type_desc = {0: 'normal', 2:
|
self.index_type_desc = {0: 'normal', 2:
|
||||||
'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
|
'inflection', 6: 'calibre'}.get(self.index_type, 'unknown')
|
||||||
@ -562,13 +555,7 @@ class IndexHeader(object): # {{{
|
|||||||
raise ValueError('Invalid TAGX section')
|
raise ValueError('Invalid TAGX section')
|
||||||
self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
|
self.tagx_header_length, = struct.unpack('>I', tagx[4:8])
|
||||||
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
|
self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12])
|
||||||
tag_table = tagx[12:self.tagx_header_length]
|
self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]]
|
||||||
if len(tag_table) % 4 != 0:
|
|
||||||
raise ValueError('Invalid Tag table')
|
|
||||||
num_tagx_entries = len(tag_table) // 4
|
|
||||||
self.tagx_entries = []
|
|
||||||
for i in range(num_tagx_entries):
|
|
||||||
self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4]))
|
|
||||||
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
|
if self.tagx_entries and not self.tagx_entries[-1].is_eof:
|
||||||
raise ValueError('TAGX last entry is not EOF')
|
raise ValueError('TAGX last entry is not EOF')
|
||||||
|
|
||||||
@ -602,6 +589,7 @@ class IndexHeader(object): # {{{
|
|||||||
|
|
||||||
a('Header length: %d'%self.header_length)
|
a('Header length: %d'%self.header_length)
|
||||||
u(self.unknown1)
|
u(self.unknown1)
|
||||||
|
a('Header type: %d'%self.header_type)
|
||||||
a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
|
a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
|
||||||
a('Offset to IDXT start: %d'%self.idxt_start)
|
a('Offset to IDXT start: %d'%self.idxt_start)
|
||||||
a('Number of index records: %d'%self.index_count)
|
a('Number of index records: %d'%self.index_count)
|
||||||
@ -661,19 +649,15 @@ class Tag(object): # {{{
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, tagx, vals, entry_type, cncx):
|
def __init__(self, tag_type, vals, cncx):
|
||||||
self.value = vals if len(vals) > 1 else vals[0] if vals else None
|
self.value = vals if len(vals) > 1 else vals[0] if vals else None
|
||||||
self.entry_type = entry_type
|
|
||||||
tag_type = tagx.tag
|
|
||||||
|
|
||||||
self.cncx_value = None
|
self.cncx_value = None
|
||||||
if tag_type in self.TAG_MAP:
|
if tag_type in self.TAG_MAP:
|
||||||
self.attr, self.desc = self.TAG_MAP[tag_type]
|
self.attr, self.desc = self.TAG_MAP[tag_type]
|
||||||
else:
|
else:
|
||||||
print ('Unknown tag value: %d in entry type: %s'%(tag_type,
|
print ('Unknown tag value: %s'%tag_type)
|
||||||
entry_type))
|
self.desc = '??Unknown (tag value: %d)'%tag_type
|
||||||
self.desc = '??Unknown (tag value: %d type: %s)'%(
|
|
||||||
tag_type, entry_type)
|
|
||||||
self.attr = 'unknown'
|
self.attr = 'unknown'
|
||||||
|
|
||||||
if '_offset' in self.attr:
|
if '_offset' in self.attr:
|
||||||
@ -695,50 +679,13 @@ class IndexEntry(object): # {{{
|
|||||||
used in the navigation UI.
|
used in the navigation UI.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, ident, entry_type, raw, cncx, tagx_entries,
|
def __init__(self, ident, entry, cncx):
|
||||||
control_byte_count):
|
try:
|
||||||
|
self.index = int(ident, 16)
|
||||||
|
except ValueError:
|
||||||
self.index = ident
|
self.index = ident
|
||||||
self.raw = raw
|
self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in
|
||||||
self.tags = []
|
entry.iteritems()]
|
||||||
self.entry_type = entry_type
|
|
||||||
self.byte_size = len(raw)
|
|
||||||
|
|
||||||
orig_raw = raw
|
|
||||||
|
|
||||||
if control_byte_count not in (1, 2):
|
|
||||||
raise ValueError('Unknown control byte count: %d'%
|
|
||||||
control_byte_count)
|
|
||||||
|
|
||||||
self.flags = 0
|
|
||||||
|
|
||||||
if control_byte_count == 2:
|
|
||||||
self.flags = ord(raw[0])
|
|
||||||
raw = raw[1:]
|
|
||||||
|
|
||||||
expected_tags = [tag for tag in tagx_entries if tag.bitmask &
|
|
||||||
entry_type]
|
|
||||||
|
|
||||||
flags = self.flags
|
|
||||||
for tag in expected_tags:
|
|
||||||
vals = []
|
|
||||||
|
|
||||||
if tag.tag > 0b1000000: # 0b1000000 = 64
|
|
||||||
has_tag = flags & 0b1
|
|
||||||
flags = flags >> 1
|
|
||||||
if not has_tag: continue
|
|
||||||
for i in range(tag.num_values):
|
|
||||||
if not raw:
|
|
||||||
raise ValueError('Index entry does not match TAGX header')
|
|
||||||
val, consumed = decint(raw)
|
|
||||||
raw = raw[consumed:]
|
|
||||||
vals.append(val)
|
|
||||||
self.tags.append(Tag(tag, vals, self.entry_type, cncx))
|
|
||||||
|
|
||||||
self.consumed = len(orig_raw) - len(raw)
|
|
||||||
self.trailing_bytes = raw
|
|
||||||
if self.trailing_bytes.replace(b'\0', b''):
|
|
||||||
raise ValueError('%s has leftover bytes: %s'%(self, format_bytes(
|
|
||||||
self.trailing_bytes)))
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def label(self):
|
def label(self):
|
||||||
@ -797,102 +744,14 @@ class IndexEntry(object): # {{{
|
|||||||
return [0, 0]
|
return [0, 0]
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, '
|
ans = ['Index Entry(index=%s, length=%d)'%(
|
||||||
'length=%d, byte_size=%d)'%(
|
self.index, len(self.tags))]
|
||||||
self.index, bin(self.entry_type), bin(self.flags)[2:],
|
|
||||||
len(self.tags), self.byte_size)]
|
|
||||||
for tag in self.tags:
|
for tag in self.tags:
|
||||||
if tag.value is not None:
|
if tag.value is not None:
|
||||||
ans.append('\t'+str(tag))
|
ans.append('\t'+str(tag))
|
||||||
if self.first_child_index != -1:
|
if self.first_child_index != -1:
|
||||||
ans.append('\tNumber of children: %d'%(self.last_child_index -
|
ans.append('\tNumber of children: %d'%(self.last_child_index -
|
||||||
self.first_child_index + 1))
|
self.first_child_index + 1))
|
||||||
if self.trailing_bytes:
|
|
||||||
ans.append('\tTrailing bytes: %r'%self.trailing_bytes)
|
|
||||||
return '\n'.join(ans)
|
|
||||||
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
class SecondaryIndexRecord(object): # {{{
|
|
||||||
|
|
||||||
def __init__(self, record, index_header, cncx):
|
|
||||||
self.record = record
|
|
||||||
raw = self.record.raw
|
|
||||||
|
|
||||||
if raw[:4] != b'INDX':
|
|
||||||
raise ValueError('Invalid Primary Index Record')
|
|
||||||
|
|
||||||
u = struct.unpack
|
|
||||||
|
|
||||||
self.header_length, = u('>I', raw[4:8])
|
|
||||||
self.unknown1 = raw[8:12]
|
|
||||||
self.header_type, = u('>I', raw[12:16])
|
|
||||||
self.unknown2 = raw[16:20]
|
|
||||||
self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
|
|
||||||
if self.idxt_offset < 192:
|
|
||||||
raise ValueError('Unknown Index record structure')
|
|
||||||
self.unknown3 = raw[28:36]
|
|
||||||
self.unknown4 = raw[36:192] # Should be 156 bytes
|
|
||||||
|
|
||||||
self.index_offsets = []
|
|
||||||
indices = raw[self.idxt_offset:]
|
|
||||||
if indices[:4] != b'IDXT':
|
|
||||||
raise ValueError("Invalid IDXT index table")
|
|
||||||
indices = indices[4:]
|
|
||||||
for i in range(self.idxt_count):
|
|
||||||
off, = u(b'>H', indices[i*2:(i+1)*2])
|
|
||||||
self.index_offsets.append(off-192)
|
|
||||||
rest = indices[(i+1)*2:]
|
|
||||||
if rest.replace(b'\0', ''): # There can be padding null bytes
|
|
||||||
raise ValueError('Extra bytes after IDXT table: %r'%rest)
|
|
||||||
|
|
||||||
indxt = raw[192:self.idxt_offset]
|
|
||||||
self.size_of_indxt_block = len(indxt)
|
|
||||||
|
|
||||||
self.indices = []
|
|
||||||
for i, off in enumerate(self.index_offsets):
|
|
||||||
try:
|
|
||||||
next_off = self.index_offsets[i+1]
|
|
||||||
except:
|
|
||||||
next_off = len(indxt)
|
|
||||||
num = ord(indxt[off])
|
|
||||||
index = indxt[off+1:off+1+num]
|
|
||||||
consumed = 1 + num
|
|
||||||
entry_type = ord(indxt[off+consumed])
|
|
||||||
pos = off+consumed+1
|
|
||||||
idxe = IndexEntry(index, entry_type,
|
|
||||||
indxt[pos:next_off], cncx,
|
|
||||||
index_header.tagx_entries,
|
|
||||||
index_header.tagx_control_byte_count)
|
|
||||||
self.indices.append(idxe)
|
|
||||||
|
|
||||||
rest = indxt[pos+self.indices[-1].consumed:]
|
|
||||||
if rest.replace(b'\0', b''): # There can be padding null bytes
|
|
||||||
raise ValueError('Extra bytes after IDXT table: %r'%rest)
|
|
||||||
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
|
|
||||||
a = ans.append
|
|
||||||
def u(w):
|
|
||||||
a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
|
|
||||||
len(w), not bool(w.replace(b'\0', b'')) ))
|
|
||||||
a('Header length: %d'%self.header_length)
|
|
||||||
u(self.unknown1)
|
|
||||||
a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
|
|
||||||
u(self.unknown2)
|
|
||||||
a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
|
|
||||||
self.idxt_offset))
|
|
||||||
a('IDXT Count: %d'%self.idxt_count)
|
|
||||||
u(self.unknown3)
|
|
||||||
u(self.unknown4)
|
|
||||||
a('Index offsets: %r'%self.index_offsets)
|
|
||||||
a('\nIndex Entries (%d entries):'%len(self.indices))
|
|
||||||
for entry in self.indices:
|
|
||||||
a(str(entry))
|
|
||||||
a('')
|
|
||||||
|
|
||||||
|
|
||||||
return '\n'.join(ans)
|
return '\n'.join(ans)
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
@ -904,58 +763,25 @@ class IndexRecord(object): # {{{
|
|||||||
in the trailing data of the text records.
|
in the trailing data of the text records.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, record, index_header, cncx):
|
def __init__(self, records, index_header, cncx):
|
||||||
self.record = record
|
|
||||||
self.alltext = None
|
self.alltext = None
|
||||||
raw = self.record.raw
|
table = OrderedDict()
|
||||||
|
tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in
|
||||||
|
index_header.tagx_entries]
|
||||||
|
for record in records:
|
||||||
|
raw = record.raw
|
||||||
|
|
||||||
if raw[:4] != b'INDX':
|
if raw[:4] != b'INDX':
|
||||||
raise ValueError('Invalid Primary Index Record')
|
raise ValueError('Invalid Primary Index Record')
|
||||||
|
|
||||||
u = struct.unpack
|
parse_index_record(table, record.raw,
|
||||||
|
index_header.tagx_control_byte_count, tags,
|
||||||
|
index_header.index_encoding, strict=True)
|
||||||
|
|
||||||
self.header_length, = u('>I', raw[4:8])
|
|
||||||
self.unknown1 = raw[8:12]
|
|
||||||
self.header_type, = u('>I', raw[12:16])
|
|
||||||
self.unknown2 = raw[16:20]
|
|
||||||
self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
|
|
||||||
if self.idxt_offset < 192:
|
|
||||||
raise ValueError('Unknown Index record structure')
|
|
||||||
self.unknown3 = raw[28:36]
|
|
||||||
self.unknown4 = raw[36:192] # Should be 156 bytes
|
|
||||||
|
|
||||||
self.index_offsets = []
|
|
||||||
indices = raw[self.idxt_offset:]
|
|
||||||
if indices[:4] != b'IDXT':
|
|
||||||
raise ValueError("Invalid IDXT index table")
|
|
||||||
indices = indices[4:]
|
|
||||||
for i in range(self.idxt_count):
|
|
||||||
off, = u(b'>H', indices[i*2:(i+1)*2])
|
|
||||||
self.index_offsets.append(off-192)
|
|
||||||
rest = indices[(i+1)*2:]
|
|
||||||
if rest.replace(b'\0', ''): # There can be padding null bytes
|
|
||||||
raise ValueError('Extra bytes after IDXT table: %r'%rest)
|
|
||||||
|
|
||||||
indxt = raw[192:self.idxt_offset]
|
|
||||||
self.size_of_indxt_block = len(indxt)
|
|
||||||
self.indices = []
|
self.indices = []
|
||||||
for i, off in enumerate(self.index_offsets):
|
|
||||||
try:
|
|
||||||
next_off = self.index_offsets[i+1]
|
|
||||||
except:
|
|
||||||
next_off = len(indxt)
|
|
||||||
index, consumed = decode_hex_number(indxt[off:])
|
|
||||||
entry_type = ord(indxt[off+consumed])
|
|
||||||
pos = off+consumed+1
|
|
||||||
idxe = IndexEntry(index, entry_type,
|
|
||||||
indxt[pos:next_off], cncx,
|
|
||||||
index_header.tagx_entries,
|
|
||||||
index_header.tagx_control_byte_count)
|
|
||||||
self.indices.append(idxe)
|
|
||||||
|
|
||||||
rest = indxt[pos+self.indices[-1].consumed:]
|
for ident, entry in table.iteritems():
|
||||||
if rest.replace(b'\0', b''): # There can be padding null bytes
|
self.indices.append(IndexEntry(ident, entry, cncx))
|
||||||
raise ValueError('Extra bytes after IDXT table: %r'%rest)
|
|
||||||
|
|
||||||
def get_parent(self, index):
|
def get_parent(self, index):
|
||||||
if index.depth < 1:
|
if index.depth < 1:
|
||||||
@ -965,24 +791,12 @@ class IndexRecord(object): # {{{
|
|||||||
if p.depth != parent_depth:
|
if p.depth != parent_depth:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20]
|
ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20]
|
||||||
a = ans.append
|
a = ans.append
|
||||||
def u(w):
|
def u(w):
|
||||||
a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
|
a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
|
||||||
len(w), not bool(w.replace(b'\0', b'')) ))
|
len(w), not bool(w.replace(b'\0', b'')) ))
|
||||||
a('Header length: %d'%self.header_length)
|
|
||||||
u(self.unknown1)
|
|
||||||
a('Unknown (header type? index record number? always 1?): %d'%self.header_type)
|
|
||||||
u(self.unknown2)
|
|
||||||
a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block,
|
|
||||||
self.idxt_offset))
|
|
||||||
a('IDXT Count: %d'%self.idxt_count)
|
|
||||||
u(self.unknown3)
|
|
||||||
u(self.unknown4)
|
|
||||||
a('Index offsets: %r'%self.index_offsets)
|
|
||||||
a('\nIndex Entries (%d entries):'%len(self.indices))
|
|
||||||
for entry in self.indices:
|
for entry in self.indices:
|
||||||
offset = entry.offset
|
offset = entry.offset
|
||||||
a(str(entry))
|
a(str(entry))
|
||||||
@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{
|
|||||||
|
|
||||||
def get_index(self, idx):
|
def get_index(self, idx):
|
||||||
for i in self.indices:
|
for i in self.indices:
|
||||||
if i.index == idx: return i
|
if i.index in {idx, unicode(idx)}: return i
|
||||||
raise IndexError('Index %d not found'%idx)
|
raise IndexError('Index %d not found'%idx)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{
|
|||||||
if entries:
|
if entries:
|
||||||
ans.append('\t%s:'%typ)
|
ans.append('\t%s:'%typ)
|
||||||
for x in entries:
|
for x in entries:
|
||||||
ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
|
ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
|
||||||
'Depth: %d, Offset: %d, Size: %d) [%s]')%(
|
'Depth: %d, Offset: %d, Size: %d) [%s]')%(
|
||||||
x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
|
x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
|
||||||
def bin4(num):
|
def bin4(num):
|
||||||
@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{
|
|||||||
' when reading starting section'%extra)
|
' when reading starting section'%extra)
|
||||||
si = self.get_index(si)
|
si = self.get_index(si)
|
||||||
ans.append('The section at the start of this record is:'
|
ans.append('The section at the start of this record is:'
|
||||||
' %d'%si.index)
|
' %s'%si.index)
|
||||||
if 0b0100 in extra:
|
if 0b0100 in extra:
|
||||||
num = extra[0b0100]
|
num = extra[0b0100]
|
||||||
ans.append('The number of articles from the section %d'
|
ans.append('The number of articles from the section %s'
|
||||||
' in this record: %d'%(si.index, num))
|
' in this record: %s'%(si.index, num))
|
||||||
elif 0b0001 in extra:
|
elif 0b0001 in extra:
|
||||||
eof = extra[0b0001]
|
eof = extra[0b0001]
|
||||||
if eof != 0:
|
if eof != 0:
|
||||||
raise ValueError('Unknown eof value %s when reading'
|
raise ValueError('Unknown eof value %s when reading'
|
||||||
' starting section. All bytes: %r'%(eof, orig))
|
' starting section. All bytes: %r'%(eof, orig))
|
||||||
ans.append('??This record has more than one article from '
|
ans.append('??This record has more than one article from '
|
||||||
' the section: %d'%si.index)
|
' the section: %s'%si.index)
|
||||||
return si, byts
|
return si, byts
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{
|
|||||||
pir = self.mobi_header.primary_index_record
|
pir = self.mobi_header.primary_index_record
|
||||||
if pir != NULL_INDEX:
|
if pir != NULL_INDEX:
|
||||||
self.index_header = IndexHeader(self.records[pir])
|
self.index_header = IndexHeader(self.records[pir])
|
||||||
|
numi = self.index_header.index_count
|
||||||
self.cncx = CNCX(self.records[
|
self.cncx = CNCX(self.records[
|
||||||
pir+2:pir+2+self.index_header.num_of_cncx_blocks],
|
pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks],
|
||||||
self.index_header.index_encoding)
|
self.index_header.index_encoding)
|
||||||
self.index_record = IndexRecord(self.records[pir+1],
|
self.index_record = IndexRecord(self.records[pir+1:pir+1+numi],
|
||||||
self.index_header, self.cncx)
|
self.index_header, self.cncx)
|
||||||
self.indexing_record_nums = set(xrange(pir,
|
self.indexing_record_nums = set(xrange(pir,
|
||||||
pir+2+self.index_header.num_of_cncx_blocks))
|
pir+1+numi+self.index_header.num_of_cncx_blocks))
|
||||||
self.secondary_index_record = self.secondary_index_header = None
|
self.secondary_index_record = self.secondary_index_header = None
|
||||||
sir = self.mobi_header.secondary_index_record
|
sir = self.mobi_header.secondary_index_record
|
||||||
if sir != NULL_INDEX:
|
if sir != NULL_INDEX:
|
||||||
self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
|
self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
|
||||||
|
numi = self.secondary_index_header.index_count
|
||||||
self.indexing_record_nums.add(sir)
|
self.indexing_record_nums.add(sir)
|
||||||
self.secondary_index_record = SecondaryIndexRecord(
|
self.secondary_index_record = IndexRecord(
|
||||||
self.records[sir+1], self.secondary_index_header, self.cncx)
|
self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx)
|
||||||
self.indexing_record_nums.add(sir+1)
|
self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi))
|
||||||
|
|
||||||
|
|
||||||
ntr = self.mobi_header.number_of_text_records
|
ntr = self.mobi_header.number_of_text_records
|
||||||
|
@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import struct
|
import struct
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict, namedtuple
|
||||||
|
|
||||||
from calibre.ebooks.mobi.utils import decint, count_set_bits
|
from calibre.ebooks.mobi.utils import (decint, count_set_bits,
|
||||||
|
decode_string)
|
||||||
|
|
||||||
|
TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
|
||||||
|
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
|
||||||
|
|
||||||
class InvalidFile(ValueError):
|
class InvalidFile(ValueError):
|
||||||
pass
|
pass
|
||||||
@ -37,9 +41,8 @@ def parse_indx_header(data):
|
|||||||
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
||||||
)
|
)
|
||||||
num = len(words)
|
num = len(words)
|
||||||
values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
|
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
|
||||||
header = {words[i]:values[i] for i in xrange(num)}
|
return dict(zip(words, values))
|
||||||
return header
|
|
||||||
|
|
||||||
class CNCX(object): # {{{
|
class CNCX(object): # {{{
|
||||||
|
|
||||||
@ -77,101 +80,94 @@ class CNCX(object): # {{{
|
|||||||
return self.records.get(offset, default)
|
return self.records.get(offset, default)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def parse_tag_section(data):
|
def parse_tagx_section(data):
|
||||||
check_signature(data, b'TAGX')
|
check_signature(data, b'TAGX')
|
||||||
|
|
||||||
tags = []
|
tags = []
|
||||||
first_entry_offset, = struct.unpack_from(b'>L', data, 0x04)
|
first_entry_offset, = struct.unpack_from(b'>L', data, 4)
|
||||||
control_byte_count, = struct.unpack_from(b'>L', data, 0x08)
|
control_byte_count, = struct.unpack_from(b'>L', data, 8)
|
||||||
|
|
||||||
# Skip the first 12 bytes already read above.
|
|
||||||
for i in xrange(12, first_entry_offset, 4):
|
for i in xrange(12, first_entry_offset, 4):
|
||||||
pos = i
|
vals = list(bytearray(data[i:i+4]))
|
||||||
tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]),
|
tags.append(TagX(*vals))
|
||||||
ord(data[pos+3])))
|
|
||||||
return control_byte_count, tags
|
return control_byte_count, tags
|
||||||
|
|
||||||
def get_tag_map(control_byte_count, tags, data, start, end):
|
def get_tag_map(control_byte_count, tagx, data, strict=False):
|
||||||
ptags = []
|
ptags = []
|
||||||
ans = {}
|
ans = {}
|
||||||
control_byte_index = 0
|
control_bytes = list(bytearray(data[:control_byte_count]))
|
||||||
data_start = start + control_byte_count
|
data = data[control_byte_count:]
|
||||||
|
|
||||||
for tag, values_per_entry, mask, end_flag in tags:
|
for x in tagx:
|
||||||
if end_flag == 0x01:
|
if x.eof == 0x01:
|
||||||
control_byte_index += 1
|
control_bytes = control_bytes[1:]
|
||||||
continue
|
continue
|
||||||
value = ord(data[start + control_byte_index]) & mask
|
value = control_bytes[0] & x.bitmask
|
||||||
if value != 0:
|
if value != 0:
|
||||||
if value == mask:
|
value_count = value_bytes = None
|
||||||
if count_set_bits(mask) > 1:
|
if value == x.bitmask:
|
||||||
|
if count_set_bits(x.bitmask) > 1:
|
||||||
# If all bits of masked value are set and the mask has more
|
# If all bits of masked value are set and the mask has more
|
||||||
# than one bit, a variable width value will follow after
|
# than one bit, a variable width value will follow after
|
||||||
# the control bytes which defines the length of bytes (NOT
|
# the control bytes which defines the length of bytes (NOT
|
||||||
# the value count!) which will contain the corresponding
|
# the value count!) which will contain the corresponding
|
||||||
# variable width values.
|
# variable width values.
|
||||||
value, consumed = decint(data[data_start:])
|
value_bytes, consumed = decint(data)
|
||||||
data_start += consumed
|
data = data[consumed:]
|
||||||
ptags.append((tag, None, value, values_per_entry))
|
|
||||||
else:
|
else:
|
||||||
ptags.append((tag, 1, None, values_per_entry))
|
value_count = 1
|
||||||
else:
|
else:
|
||||||
# Shift bits to get the masked value.
|
# Shift bits to get the masked value.
|
||||||
while mask & 0x01 == 0:
|
mask = x.bitmask
|
||||||
mask = mask >> 1
|
while mask & 0b1 == 0:
|
||||||
value = value >> 1
|
mask >>= 1
|
||||||
ptags.append((tag, value, None, values_per_entry))
|
value >>= 1
|
||||||
for tag, value_count, value_bytes, values_per_entry in ptags:
|
value_count = value
|
||||||
|
ptags.append(PTagX(x.tag, value_count, value_bytes,
|
||||||
|
x.num_of_values))
|
||||||
|
|
||||||
|
for x in ptags:
|
||||||
values = []
|
values = []
|
||||||
if value_count != None:
|
if x.value_count is not None:
|
||||||
# Read value_count * values_per_entry variable width values.
|
# Read value_count * values_per_entry variable width values.
|
||||||
for _ in xrange(value_count*values_per_entry):
|
for _ in xrange(x.value_count * x.num_of_values):
|
||||||
byts, consumed = decint(data[data_start:])
|
byts, consumed = decint(data)
|
||||||
data_start += consumed
|
data = data[consumed:]
|
||||||
values.append(byts)
|
values.append(byts)
|
||||||
else:
|
else: # value_bytes is not None
|
||||||
# Convert value_bytes to variable width values.
|
# Convert value_bytes to variable width values.
|
||||||
total_consumed = 0
|
total_consumed = 0
|
||||||
while total_consumed < value_bytes:
|
while total_consumed < x.value_bytes:
|
||||||
# Does this work for values_per_entry != 1?
|
# Does this work for values_per_entry != 1?
|
||||||
byts, consumed = decint(data[data_start:])
|
byts, consumed = decint(data)
|
||||||
data_start += consumed
|
data = data[consumed:]
|
||||||
total_consumed += consumed
|
total_consumed += consumed
|
||||||
values.append(byts)
|
values.append(byts)
|
||||||
if total_consumed != value_bytes:
|
if total_consumed != x.value_bytes:
|
||||||
print ("Error: Should consume %s bytes, but consumed %s" %
|
err = ("Error: Should consume %s bytes, but consumed %s" %
|
||||||
(value_bytes, total_consumed))
|
(x.value_bytes, total_consumed))
|
||||||
ans[tag] = values
|
if strict:
|
||||||
# Test that all bytes have been processed if end is given.
|
raise ValueError(err)
|
||||||
if end is not None and data_start < end:
|
else:
|
||||||
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
|
print(err)
|
||||||
rest = data[data_start:end]
|
ans[x.tag] = values
|
||||||
if rest.replace(b'\0', b''):
|
# Test that all bytes have been processed
|
||||||
print ("Warning: There are unprocessed index bytes left: %s" %
|
if data.replace(b'\0', b''):
|
||||||
format_bytes(rest))
|
err = ("Warning: There are unprocessed index bytes left: %s" %
|
||||||
|
format_bytes(data))
|
||||||
|
if strict:
|
||||||
|
raise ValueError(err)
|
||||||
|
else:
|
||||||
|
print(err)
|
||||||
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def read_index(sections, idx, codec):
|
def parse_index_record(table, data, control_byte_count, tags, codec,
|
||||||
table, cncx = OrderedDict(), CNCX([], codec)
|
strict=False):
|
||||||
|
|
||||||
data = sections[idx][0]
|
|
||||||
|
|
||||||
indx_header = parse_indx_header(data)
|
|
||||||
indx_count = indx_header['count']
|
|
||||||
|
|
||||||
if indx_header['ncncx'] > 0:
|
|
||||||
off = idx + indx_count + 1
|
|
||||||
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
|
|
||||||
cncx = CNCX(cncx_records, codec)
|
|
||||||
|
|
||||||
tag_section_start = indx_header['len']
|
|
||||||
control_byte_count, tags = parse_tag_section(data[tag_section_start:])
|
|
||||||
|
|
||||||
for i in xrange(idx + 1, idx + 1 + indx_count):
|
|
||||||
data = sections[i][0]
|
|
||||||
header = parse_indx_header(data)
|
header = parse_indx_header(data)
|
||||||
idxt_pos = header['start']
|
idxt_pos = header['start']
|
||||||
|
if data[idxt_pos:idxt_pos+4] != b'IDXT':
|
||||||
|
print ('WARNING: Invalid INDX record')
|
||||||
entry_count = header['count']
|
entry_count = header['count']
|
||||||
|
|
||||||
# loop through to build up the IDXT position starts
|
# loop through to build up the IDXT position starts
|
||||||
@ -187,11 +183,32 @@ def read_index(sections, idx, codec):
|
|||||||
# text
|
# text
|
||||||
for j in xrange(entry_count):
|
for j in xrange(entry_count):
|
||||||
start, end = idx_positions[j:j+2]
|
start, end = idx_positions[j:j+2]
|
||||||
text_length = ord(data[start])
|
rec = data[start:end]
|
||||||
text = data[start+1:start+1+text_length]
|
ident, consumed = decode_string(rec, codec=codec)
|
||||||
tag_map = get_tag_map(control_byte_count, tags, data,
|
rec = rec[consumed:]
|
||||||
start+1+text_length, end)
|
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
|
||||||
table[text] = tag_map
|
table[ident] = tag_map
|
||||||
|
|
||||||
|
|
||||||
|
def read_index(sections, idx, codec):
|
||||||
|
table, cncx = OrderedDict(), CNCX([], codec)
|
||||||
|
|
||||||
|
data = sections[idx][0]
|
||||||
|
|
||||||
|
indx_header = parse_indx_header(data)
|
||||||
|
indx_count = indx_header['count']
|
||||||
|
|
||||||
|
if indx_header['ncncx'] > 0:
|
||||||
|
off = idx + indx_count + 1
|
||||||
|
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
|
||||||
|
cncx = CNCX(cncx_records, codec)
|
||||||
|
|
||||||
|
tag_section_start = indx_header['len']
|
||||||
|
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
|
||||||
|
|
||||||
|
for i in xrange(idx + 1, idx + 1 + indx_count):
|
||||||
|
# Index record
|
||||||
|
data = sections[i][0]
|
||||||
|
parse_index_record(table, data, control_byte_count, tags, codec)
|
||||||
return table, cncx
|
return table, cncx
|
||||||
|
|
||||||
|
@ -317,6 +317,7 @@ class Mobi8Reader(object):
|
|||||||
for entry in index_entries:
|
for entry in index_entries:
|
||||||
pos = entry['pos']
|
pos = entry['pos']
|
||||||
fi = self.get_file_info(pos)
|
fi = self.get_file_info(pos)
|
||||||
|
#print (11111111, fi, entry['pos_fid'])
|
||||||
if fi.filename is None:
|
if fi.filename is None:
|
||||||
raise ValueError('Index entry has invalid pos: %d'%pos)
|
raise ValueError('Index entry has invalid pos: %d'%pos)
|
||||||
idtag = self.get_id_tag(pos).decode(self.header.codec)
|
idtag = self.get_id_tag(pos).decode(self.header.codec)
|
||||||
|
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre.ebooks.metadata.toc import TOC
|
from calibre.ebooks.metadata.toc import TOC
|
||||||
from calibre.ebooks.mobi.utils import to_base
|
|
||||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||||
from calibre.ebooks.mobi.reader.index import read_index
|
from calibre.ebooks.mobi.reader.index import read_index
|
||||||
|
|
||||||
@ -23,7 +22,30 @@ tag_fieldname_map = {
|
|||||||
6: ['pos_fid',0],
|
6: ['pos_fid',0],
|
||||||
21: ['parent',0],
|
21: ['parent',0],
|
||||||
22: ['child1',0],
|
22: ['child1',0],
|
||||||
23: ['childn',0]
|
23: ['childn',0],
|
||||||
|
69: ['image_index',0],
|
||||||
|
70 : ['desc_offset', 0], # 'Description offset in cncx'
|
||||||
|
71 : ['author_offset', 0], # 'Author offset in cncx'
|
||||||
|
72 : ['image_caption_offset', 0], # 'Image caption offset in cncx',
|
||||||
|
73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx',
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
default_entry = {
|
||||||
|
'pos': -1,
|
||||||
|
'len': 0,
|
||||||
|
'noffs': -1,
|
||||||
|
'text' : "Unknown Text",
|
||||||
|
'hlvl' : -1,
|
||||||
|
'kind' : "Unknown Class",
|
||||||
|
'pos_fid' : None,
|
||||||
|
'parent' : -1,
|
||||||
|
'child1' : -1,
|
||||||
|
'childn' : -1,
|
||||||
|
'description': None,
|
||||||
|
'author': None,
|
||||||
|
'image_caption': None,
|
||||||
|
'image_attribution': None,
|
||||||
}
|
}
|
||||||
|
|
||||||
def read_ncx(sections, index, codec):
|
def read_ncx(sections, index, codec):
|
||||||
@ -34,32 +56,25 @@ def read_ncx(sections, index, codec):
|
|||||||
|
|
||||||
for num, x in enumerate(table.iteritems()):
|
for num, x in enumerate(table.iteritems()):
|
||||||
text, tag_map = x
|
text, tag_map = x
|
||||||
entry = {
|
entry = default_entry.copy()
|
||||||
'name': text,
|
entry['name'] = text
|
||||||
'pos': -1,
|
entry['num'] = num
|
||||||
'len': 0,
|
|
||||||
'noffs': -1,
|
|
||||||
'text' : "Unknown Text",
|
|
||||||
'hlvl' : -1,
|
|
||||||
'kind' : "Unknown Kind",
|
|
||||||
'pos_fid' : None,
|
|
||||||
'parent' : -1,
|
|
||||||
'child1' : -1,
|
|
||||||
'childn' : -1,
|
|
||||||
'num' : num
|
|
||||||
}
|
|
||||||
|
|
||||||
for tag in tag_fieldname_map.keys():
|
for tag in tag_fieldname_map.iterkeys():
|
||||||
fieldname, i = tag_fieldname_map[tag]
|
fieldname, i = tag_fieldname_map[tag]
|
||||||
if tag in tag_map:
|
if tag in tag_map:
|
||||||
fieldvalue = tag_map[tag][i]
|
fieldvalue = tag_map[tag][i]
|
||||||
if tag == 6:
|
if tag == 6:
|
||||||
fieldvalue = to_base(fieldvalue, base=32)
|
# Appears to be an idx into the KF8 elems table with an
|
||||||
|
# offset
|
||||||
|
fieldvalue = tuple(tag_map[tag])
|
||||||
entry[fieldname] = fieldvalue
|
entry[fieldname] = fieldvalue
|
||||||
if tag == 3:
|
for which, name in {3:'text', 5:'kind', 70:'description',
|
||||||
entry['text'] = cncx.get(fieldvalue, 'Unknown Text')
|
71:'author', 72:'image_caption',
|
||||||
if tag == 5:
|
73:'image_attribution'}.iteritems():
|
||||||
entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind')
|
if tag == which:
|
||||||
|
entry[name] = cncx.get(fieldvalue,
|
||||||
|
default_entry[name])
|
||||||
index_entries.append(entry)
|
index_entries.append(entry)
|
||||||
|
|
||||||
return index_entries
|
return index_entries
|
||||||
|
@ -15,7 +15,13 @@ from calibre.ebooks import normalize
|
|||||||
|
|
||||||
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
||||||
|
|
||||||
def decode_hex_number(raw):
|
def decode_string(raw, codec='utf-8'):
    '''
    Decode a length prefixed string from the start of raw.

    The first byte of raw is the number of bytes of string data that follow
    it. If raw holds fewer bytes than the prefix announces, whatever is
    available is decoded, but the reported size is still the announced one.

    :param raw: Raw binary data as a bytestring
    :param codec: The encoding used to decode the string data

    :return: The decoded string and the number of bytes from raw that the
    string occupies (the length byte plus the announced string length).

    :raises IndexError: If raw is empty
    '''
    # Read the length byte through a one byte slice: indexing a bytestring
    # gives a str on python 2 but an int on python 3, while a bytearray built
    # from the slice yields an int on both.
    length = bytearray(raw[:1])[0]
    text = raw[1:1+length]
    return text.decode(codec), length + 1
|
||||||
|
|
||||||
|
def decode_hex_number(raw, codec='utf-8'):
|
||||||
'''
|
'''
|
||||||
Return a variable length number encoded using hexadecimal encoding. These
|
Return a variable length number encoded using hexadecimal encoding. These
|
||||||
numbers have the first byte which tells the number of bytes that follow.
|
numbers have the first byte which tells the number of bytes that follow.
|
||||||
@ -25,13 +31,16 @@ def decode_hex_number(raw):
|
|||||||
:param raw: Raw binary data as a bytestring
|
:param raw: Raw binary data as a bytestring
|
||||||
|
|
||||||
:return: The number and the number of bytes from raw that the number
|
:return: The number and the number of bytes from raw that the number
|
||||||
occupies
|
occupies.
|
||||||
'''
|
'''
|
||||||
length, = struct.unpack(b'>B', raw[0])
|
raw, consumed = decode_string(raw, codec=codec)
|
||||||
raw = raw[1:1+length]
|
|
||||||
consumed = length+1
|
|
||||||
return int(raw, 16), consumed
|
return int(raw, 16), consumed
|
||||||
|
|
||||||
|
def encode_string(raw):
    '''
    Encode raw as a length prefixed bytestring: a single byte holding the
    number of bytes in raw, followed by raw itself. This is the inverse of the
    framing read by decode_string().

    :param raw: The data to encode, anything accepted by bytes()

    :return: The length byte followed by the data, as a bytestring

    :raises ValueError: If the data is longer than 255 bytes, as the length
    must fit into a single byte
    '''
    payload = bytearray(bytes(raw))
    size = len(payload)
    if size > 255:
        # Fail with a clear message instead of relying on the range error
        # bytearray raises when asked to store a value over 255
        raise ValueError(
            'Cannot encode a string of %d bytes, the maximum is 255' % size)
    payload.insert(0, size)
    return bytes(payload)
|
||||||
|
|
||||||
def encode_number_as_hex(num):
|
def encode_number_as_hex(num):
|
||||||
'''
|
'''
|
||||||
Encode num as a variable length encoded hexadecimal number. Returns the
|
Encode num as a variable length encoded hexadecimal number. Returns the
|
||||||
@ -44,9 +53,7 @@ def encode_number_as_hex(num):
|
|||||||
nlen = len(num)
|
nlen = len(num)
|
||||||
if nlen % 2 != 0:
|
if nlen % 2 != 0:
|
||||||
num = b'0'+num
|
num = b'0'+num
|
||||||
ans = bytearray(num)
|
return encode_string(num)
|
||||||
ans.insert(0, len(num))
|
|
||||||
return bytes(ans)
|
|
||||||
|
|
||||||
def encint(value, forward=True):
|
def encint(value, forward=True):
|
||||||
'''
|
'''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user