From 23f9bdc7c90133bd7e6c02aaaa577354c92ac9a5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 26 Mar 2012 13:56:12 +0530
Subject: [PATCH] KF8 Input: Support KF8 format Amazon samples. Fixes #963418
 (UnicodeDecodeError invalid start byte when displaying KF8 ebook)

---
 src/calibre/ebooks/mobi/reader/index.py | 47 +++++++++++++++++++++----
 src/calibre/ebooks/mobi/reader/mobi8.py |  6 +++-
 src/calibre/ebooks/mobi/utils.py        |  4 ++-
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py
index dd85b5a5cb..1979458b2a 100644
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -39,10 +39,43 @@ def parse_indx_header(data):
     words = (
             'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
             'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
-    )
+    ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
+            'ordt1', 'ordt2', 'tagx')
     num = len(words)
     values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
-    return dict(zip(words, values))
+    ans = dict(zip(words, values))
+    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
+    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
+    ans['ordt_map'] = ''
+
+    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
+        # I don't know what this is, but using it seems to be unnecessary, so
+        # just leave it as the raw bytestring
+        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
+    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
+        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
+        if ans['code'] == 65002:
+            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
+            # bothered to write a decoder for this (see
+            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
+            # Instead I use a half-assed decoder that decodes only the
+            # ASCII-valid values correctly. Hopefully these ORDT sections will
+            # only ever be used in SKEL and ELEM indices where the text is pure
+            # ASCII (this assumes EBCDIC-UTF and ASCII agree on those values).
+            # Any values outside 0x21-0x7e are mapped to the ? character
+
+            parsed = bytearray(ans['oentries'])
+            for i in xrange(0, 2*ans['oentries'], 2):
+                if 0x20 < raw[i+1] < 0x7f:
+                    parsed[i//2] = raw[i+1]
+                else:
+                    parsed[i//2] = ord(b'?')
+            ans['ordt_map'] = bytes(parsed).decode('ascii')
+        else:
+            ans['ordt_map'] = '?'*ans['oentries']
+
+    return ans
+
 
 class CNCX(object): # {{{
 
@@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
     return ans
 
 def parse_index_record(table, data, control_byte_count, tags, codec,
-        strict=False):
+        ordt_map, strict=False):
     header = parse_indx_header(data)
     idxt_pos = header['start']
     if data[idxt_pos:idxt_pos+4] != b'IDXT':
@@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
     for j in xrange(entry_count):
         start, end = idx_positions[j:j+2]
         rec = data[start:end]
-        ident, consumed = decode_string(rec, codec=codec)
+        ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
         rec = rec[consumed:]
         tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
         table[ident] = tag_map
 
-
 def read_index(sections, idx, codec):
     table, cncx = OrderedDict(), CNCX([], codec)
 
@@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
         cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
         cncx = CNCX(cncx_records, codec)
 
-    tag_section_start = indx_header['len']
+    tag_section_start = indx_header['tagx']
     control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
 
     for i in xrange(idx + 1, idx + 1 + indx_count):
         # Index record
         data = sections[i][0]
-        parse_index_record(table, data, control_byte_count, tags, codec)
+        parse_index_record(table, data, control_byte_count, tags, codec,
+                indx_header['ordt_map'])
     return table, cncx
 
diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py
index ec7166ebb0..d2254e00d8 100644
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -285,7 +285,11 @@ class Mobi8Reader(object):
     def create_guide(self):
         guide = Guide()
         for ref_type, ref_title, fileno in self.guide:
-            elem = self.elems[fileno]
+            try:
+                elem = self.elems[fileno]
+            except IndexError:
+                # Happens for thumbnailstandard in Amazon book samples
+                continue
             fi = self.get_file_info(elem.insert_pos)
             idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
             linktgt = fi.filename
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 4c1e52e119..3530736ba0 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -15,10 +15,12 @@ from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
 
-def decode_string(raw, codec='utf-8'):
+def decode_string(raw, codec='utf-8', ordt_map=''):
     length, = struct.unpack(b'>B', raw[0])
     raw = raw[1:1+length]
     consumed = length+1
+    if ordt_map:
+        return ''.join(ordt_map[ord(x)] for x in raw), consumed
     return raw.decode(codec), consumed
 
 def decode_hex_number(raw, codec='utf-8'):