AZW3 Input: Handle AZW3 files with incorrect TAGX Offset INDX header fields. Fixes #1955308 [MOBI reader loads TAGX from wrong offset](https://bugs.launchpad.net/calibre/+bug/1955308)

2025-07-09 03:04:10 -04:00 · 2021-12-19 08:56:36 +05:30 · 2021-12-19 08:56:36 +05:30 · aec2c1a551
commit aec2c1a551
parent 172ee5d531
2 changed files with 17 additions and 6 deletions
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@ -7,13 +7,14 @@ __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import struct
-
 from collections import OrderedDict, namedtuple

 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
-from calibre.ebooks.mobi.reader.index import (CNCX, parse_indx_header,
-        parse_tagx_section, parse_index_record, INDEX_HEADER_FIELDS)
-from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
+from calibre.ebooks.mobi.reader.index import (
+    CNCX, INDEX_HEADER_FIELDS, get_tag_section_start, parse_index_record,
+    parse_indx_header, parse_tagx_section
+)
+from calibre.ebooks.mobi.reader.ncx import default_entry, tag_fieldname_map
 from polyglot.builtins import iteritems

 File = namedtuple('File',
@ -71,7 +72,7 @@ def read_index(sections, idx, codec):
        cncx_records = [x.raw for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)

-    tag_section_start = indx_header['tagx']
+    tag_section_start = get_tag_section_start(data, indx_header)
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])

    read_variable_len_data(data, indx_header)
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@ -51,6 +51,7 @@ def parse_indx_header(data):
    num = len(words)
    values = struct.unpack('>%dL' % num, data[4:4*(num+1)])
    ans = dict(zip(words, values))
+    ans['idx_header_end_pos'] = 4 * (num+1)
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''
@ -253,6 +254,15 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
    return header


+def get_tag_section_start(data, indx_header):  # pick the offset at which to start parsing TAGX
+    tag_section_start = indx_header['tagx']  # offset recorded in the parsed INDX header
+    if data[tag_section_start:tag_section_start + 4] != b'TAGX':  # recorded offset does not point at the marker
+        tpos = data.find(b'TAGX', indx_header['idx_header_end_pos'])  # search only past the unpacked header words
+        if tpos > -1:  # bytes.find returns -1 when the marker is absent
+            tag_section_start = tpos
+    return tag_section_start  # still the header's value if no b'TAGX' marker was found
+
+
 def read_index(sections, idx, codec):
    table, cncx = OrderedDict(), CNCX([], codec)

@ -266,7 +276,7 @@ def read_index(sections, idx, codec):
        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)

-    tag_section_start = indx_header['tagx']
+    tag_section_start = get_tag_section_start(data, indx_header)
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])

    for i in range(idx + 1, idx + 1 + indx_count):