From 60ec978a8f632e9d7b83f05ebe145353b8763f10 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jun 2012 15:12:55 +0530 Subject: [PATCH] KF8 Input: Handle files with incorrectly encoded guide type entries. Fixes #1015020 (azw3 file "invalid start byte") --- src/calibre/ebooks/mobi/reader/index.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index c732d8862e..6086d547bf 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -224,7 +224,18 @@ def parse_index_record(table, data, control_byte_count, tags, codec, for j in xrange(entry_count): start, end = idx_positions[j:j+2] rec = data[start:end] - ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map) + # Sometimes (in the guide table if the type attribute has non ascii + # values) the ident is UTF-16 encoded. Try to handle that. + try: + ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map) + except UnicodeDecodeError: + ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map) + if u'\x00' in ident: + try: + ident, consumed = decode_string(rec, codec='utf-16', + ordt_map=ordt_map) + except UnicodeDecodeError: + ident = ident.replace('u\x00', u'') rec = rec[consumed:] tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) table[ident] = tag_map