AZW3 Input: Handle AZW3 files with incorrect TAGX Offset INDX header fields. Fixes #1955308 [MOBI reader loads TAGX from wrong offset](https://bugs.launchpad.net/calibre/+bug/1955308)

This commit is contained in:
Kovid Goyal 2021-12-19 08:56:36 +05:30
parent 172ee5d531
commit aec2c1a551
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 17 additions and 6 deletions

View File

@ -7,13 +7,14 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct
from collections import OrderedDict, namedtuple
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
from calibre.ebooks.mobi.reader.index import (
CNCX, INDEX_HEADER_FIELDS, get_tag_section_start, parse_index_record,
parse_indx_header, parse_tagx_section
)
from calibre.ebooks.mobi.reader.ncx import default_entry, tag_fieldname_map
from polyglot.builtins import iteritems
File = namedtuple('File',
@ -71,7 +72,7 @@ def read_index(sections, idx, codec):
cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['tagx']
tag_section_start = get_tag_section_start(data, indx_header)
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
read_variable_len_data(data, indx_header)

View File

@ -51,6 +51,7 @@ def parse_indx_header(data):
num = len(words)
values = struct.unpack('>%dL' % num, data[4:4*(num+1)])
ans = dict(zip(words, values))
ans['idx_header_end_pos'] = 4 * (num+1)
ordt1, ordt2 = ans['ordt1'], ans['ordt2']
ans['ordt1_raw'], ans['ordt2_raw'] = [], []
ans['ordt_map'] = ''
@ -253,6 +254,15 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
return header
def get_tag_section_start(data, indx_header):
    """Return the offset in *data* at which the TAGX section begins.

    The offset recorded in ``indx_header['tagx']`` is used when the bytes at
    that position are the ``TAGX`` marker. Some AZW3 files carry an incorrect
    offset in that header field; for those, *data* is searched for the marker
    starting at ``indx_header['idx_header_end_pos']``.

    :param data: Raw bytes of the INDX record.
    :param indx_header: Mapping of parsed INDX header fields; must contain
        the ``'tagx'`` and ``'idx_header_end_pos'`` keys.
    :return: The offset of the ``TAGX`` marker, or the unmodified header
        value when no marker can be found after the header.
    """
    tag_section_start = indx_header['tagx']
    if not data.startswith(b'TAGX', tag_section_start):
        # The header field does not point at a TAGX marker, so look for the
        # marker itself, skipping the fixed-size header words.
        tpos = data.find(b'TAGX', indx_header['idx_header_end_pos'])
        if tpos > -1:
            tag_section_start = tpos
    return tag_section_start
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
@ -266,7 +276,7 @@ def read_index(sections, idx, codec):
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['tagx']
tag_section_start = get_tag_section_start(data, indx_header)
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in range(idx + 1, idx + 1 + indx_count):