From 23f9bdc7c90133bd7e6c02aaaa577354c92ac9a5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 26 Mar 2012 13:56:12 +0530 Subject: [PATCH] KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError invalid start byte when displaying KF8 ebook) --- src/calibre/ebooks/mobi/reader/index.py | 47 +++++++++++++++++++++---- src/calibre/ebooks/mobi/reader/mobi8.py | 6 +++- src/calibre/ebooks/mobi/utils.py | 4 ++- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index dd85b5a5cb..1979458b2a 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -39,10 +39,43 @@ def parse_indx_header(data): words = ( 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' - ) + ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries', + 'ordt1', 'ordt2', 'tagx') num = len(words) values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) - return dict(zip(words, values)) + ans = dict(zip(words, values)) + ordt1, ordt2 = ans['ordt1'], ans['ordt2'] + ans['ordt1_raw'], ans['ordt2_raw'] = [], [] + ans['ordt_map'] = '' + + if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT': + # I don't know what this is, but using it seems to be unnecessary, so + # just leave it as the raw bytestring + ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']] + if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT': + ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']]) + if ans['code'] == 65002: + # This appears to be EBCDIC-UTF (65002) encoded. I can't be + # bothered to write a decoder for this (see + # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon? + # Instead I use a half assed decoder that decodes only the ascii + # valid values correctly. 
Hopefully these ORDT sections will only + # ever be used in SKEL and ELEM indices where the text is pure + # ASCII. EBCDIC-UTF and ASCII presumably encode those values identically. + # Any values that are not valid ASCII are mapped to the ? character + + parsed = bytearray(ans['oentries']) + for i in xrange(0, 2*ans['oentries'], 2): + if 0x20 < raw[i+1] < 0x7f: + parsed[i//2] = raw[i+1] + else: + parsed[i//2] = ord(b'?') + ans['ordt_map'] = bytes(parsed).decode('ascii') + else: + ans['ordt_map'] = '?'*ans['oentries'] + + return ans + class CNCX(object): # {{{ @@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False): return ans def parse_index_record(table, data, control_byte_count, tags, codec, - strict=False): + ordt_map, strict=False): header = parse_indx_header(data) idxt_pos = header['start'] if data[idxt_pos:idxt_pos+4] != b'IDXT': @@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec, for j in xrange(entry_count): start, end = idx_positions[j:j+2] rec = data[start:end] - ident, consumed = decode_string(rec, codec=codec) + ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map) rec = rec[consumed:] tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) table[ident] = tag_map - def read_index(sections, idx, codec): table, cncx = OrderedDict(), CNCX([], codec) @@ -203,12 +235,13 @@ def read_index(sections, idx, codec): cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]] cncx = CNCX(cncx_records, codec) - tag_section_start = indx_header['len'] + tag_section_start = indx_header['tagx'] control_byte_count, tags = parse_tagx_section(data[tag_section_start:]) for i in xrange(idx + 1, idx + 1 + indx_count): # Index record data = sections[i][0] - parse_index_record(table, data, control_byte_count, tags, codec) + parse_index_record(table, data, control_byte_count, tags, codec, + indx_header['ordt_map']) return table, cncx diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py 
b/src/calibre/ebooks/mobi/reader/mobi8.py index ec7166ebb0..d2254e00d8 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -285,7 +285,11 @@ class Mobi8Reader(object): def create_guide(self): guide = Guide() for ref_type, ref_title, fileno in self.guide: - elem = self.elems[fileno] + try: + elem = self.elems[fileno] + except IndexError: + # Happens for thumbnailstandard in Amazon book samples + continue fi = self.get_file_info(elem.insert_pos) idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec) linktgt = fi.filename diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 4c1e52e119..3530736ba0 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -15,10 +15,12 @@ from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 -def decode_string(raw, codec='utf-8'): +def decode_string(raw, codec='utf-8', ordt_map=''): length, = struct.unpack(b'>B', raw[0]) raw = raw[1:1+length] consumed = length+1 + if ordt_map: + return ''.join(ordt_map[ord(x)] for x in raw), consumed return raw.decode(codec), consumed def decode_hex_number(raw, codec='utf-8'):