Dump the index record geometry by using the IDXT table

This commit is contained in:
Kovid Goyal 2014-03-29 15:19:31 +05:30
parent 624b39a32e
commit df0f4106a5

View File

@ -25,35 +25,36 @@ Elem = namedtuple('Chunk',
GuideRef = namedtuple('GuideRef', 'type title pos_fid')
INDEX_HEADER_FIELDS = INDEX_HEADER_FIELDS + ('last_index', 'tagx_block_size', 'tagx_block')
INDEX_HEADER_FIELDS = INDEX_HEADER_FIELDS + ('indices', 'tagx_block_size', 'tagx_block')
FIELD_NAMES = {'len':'Header length', 'type':'Unknown', 'gen':'Index Type (0 - normal, 2 - inflection)',
'start':'IDXT Offset', 'count':'Number of Index entries or records', 'code': 'character encoding', 'lng':'Unknown',
'total':'Total number of Index Entries in all records', 'ordt': 'ORDT Offset', 'ligt':'LIGT Offset', 'nligt':'Number of LIGT',
'ncncx':'Number of CNCX records', 'last_index':'Geometry of index records'}
'start':'IDXT Offset', 'count':'Number of entries in this record', 'code': 'character encoding', 'lng':'Unknown',
'total':'Total number of actual Index Entries in all records', 'ordt': 'ORDT Offset', 'ligt':'LIGT Offset', 'nligt':'Number of LIGT',
'ncncx':'Number of CNCX records', 'indices':'Geometry of index records'}
def read_variable_len_data(data, header):
offset = header['tagx']
indices = []
idxt_offset = header['start']
idxt_size = 4 + header['count'] * 2
if offset > 0:
tagx_block_size = header['tagx_block_size'] = struct.unpack_from(b'>I', data, offset + 4)[0]
header['tagx_block'] = data[offset:offset+tagx_block_size]
offset += tagx_block_size
offset = idxt_offset + 4
for i in xrange(header['count']):
strlen = bytearray(data[offset:offset+1])[0]
text = data[offset+1:offset+1+strlen].decode('ascii')
offset += 1 + strlen
num = struct.unpack_from(b'>H', data, offset)[0]
p = struct.unpack_from(b'>H', data, offset)[0]
offset += 2
strlen = bytearray(data[p])[0]
text = data[p+1:p+1+strlen]
p += 1 + strlen
num = struct.unpack_from(b'>H', data, p)[0]
indices.append((text, num))
else:
header['tagx_block'] = b''
header['tagx_block_size'] = 0
idxt_offset = header['start']
idxt_size = 4 + header['count'] * 2
trailing_bytes = data[idxt_offset+idxt_size:]
if trailing_bytes.rstrip(b'\0'):
raise ValueError('Traling bytes after last IDXT entry: %r' % trailing_bytes.rstrip(b'\0'))
header['last_index'] = indices
header['indices'] = indices
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)