KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError invalid start byte when displaying KF8 ebook)

2025-07-09 03:04:10 -04:00 · 2012-03-26 13:56:12 +05:30 · 2012-03-26 13:56:12 +05:30 · 23f9bdc7c9
commit 23f9bdc7c9
parent aba3f4686f
3 changed files with 48 additions and 9 deletions
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@ -39,10 +39,43 @@ def parse_indx_header(data):
    words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
-    )
+    ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
+            'ordt1', 'ordt2', 'tagx')
    num = len(words)
    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
-    return dict(zip(words, values))
+    ans = dict(zip(words, values))
+    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
+    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
+    ans['ordt_map'] = ''
+
+    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
+        # I don't know what this is, but using it seems to be unnecessary, so
+        # just leave it as the raw bytestring
+        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
+    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
+        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
+        if ans['code'] == 65002:
+            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
+            # bothered to write a decoder for this (see
+            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
+            # Instead I use a half-assed decoder that decodes only the ASCII
+            # valid values correctly. Hopefully these ORDT sections will only
+            # ever be used in SKEL and ELEM indices where the text is pure
+            # ASCII (presumably encoded identically in EBCDIC-UTF and ASCII).
+            # Any values that are not printable ASCII are mapped to ?
+
+            parsed = bytearray(ans['oentries'])
+            for i in xrange(0, 2*ans['oentries'], 2):
+                if 0x20 < raw[i+1] < 0x7f:
+                    parsed[i//2] = raw[i+1]
+                else:
+                    parsed[i//2] = ord(b'?')
+            ans['ordt_map'] = bytes(parsed).decode('ascii')
+        else:
+            ans['ordt_map'] = '?'*ans['oentries']
+
+    return ans
+

 class CNCX(object): # {{{

@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
    return ans

 def parse_index_record(table, data, control_byte_count, tags, codec,
-        strict=False):
+        ordt_map, strict=False):
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
-        ident, consumed = decode_string(rec, codec=codec)
+        ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map

-
 def read_index(sections, idx, codec):
    table, cncx = OrderedDict(), CNCX([], codec)

@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)

-    tag_section_start = indx_header['len']
+    tag_section_start = indx_header['tagx']
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])

    for i in xrange(idx + 1, idx + 1 + indx_count):
        # Index record
        data = sections[i][0]
-        parse_index_record(table, data, control_byte_count, tags, codec)
+        parse_index_record(table, data, control_byte_count, tags, codec,
+                indx_header['ordt_map'])
    return table, cncx

--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -285,7 +285,11 @@ class Mobi8Reader(object):
    def create_guide(self):
        guide = Guide()
        for ref_type, ref_title, fileno in self.guide:
-            elem = self.elems[fileno]
+            try:
+                elem = self.elems[fileno]
+            except IndexError:
+                # Happens for thumbnailstandard in Amazon book samples
+                continue
            fi = self.get_file_info(elem.insert_pos)
            idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
            linktgt = fi.filename
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -15,10 +15,12 @@ from calibre.ebooks import normalize

 IMAGE_MAX_SIZE = 10 * 1024 * 1024

-def decode_string(raw, codec='utf-8'):
+def decode_string(raw, codec='utf-8', ordt_map=''):
    length, = struct.unpack(b'>B', raw[0])
    raw = raw[1:1+length]
    consumed = length+1
+    if ordt_map:
+        return ''.join(ordt_map[ord(x)] for x in raw), consumed
    return raw.decode(codec), consumed

 def decode_hex_number(raw, codec='utf-8'):