KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError invalid start byte when displaying KF8 ebook)

This commit is contained in:
Kovid Goyal 2012-03-26 13:56:12 +05:30
parent aba3f4686f
commit 23f9bdc7c9
3 changed files with 48 additions and 9 deletions

View File

@ -39,10 +39,43 @@ def parse_indx_header(data):
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
)
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
'ordt1', 'ordt2', 'tagx')
num = len(words)
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
return dict(zip(words, values))
ans = dict(zip(words, values))
ordt1, ordt2 = ans['ordt1'], ans['ordt2']
ans['ordt1_raw'], ans['ordt2_raw'] = [], []
ans['ordt_map'] = ''
if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
# I don't know what this is, but using it seems to be unnecessary, so
# just leave it as the raw bytestring
ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
if ans['code'] == 65002:
# This appears to be EBCDIC-UTF (65002) encoded. I can't be
# bothered to write a decoder for this (see
# http://www.unicode.org/reports/tr16/). Just how stupid is Amazon?
# Instead I use a half-assed decoder that decodes only the ASCII
# valid values correctly. Hopefully these ORDT sections will only
# ever be used in SKEL and ELEM indices where the text is pure
# ASCII. The decoder assumes EBCDIC-UTF and ASCII have the same byte
# values for these characters (TODO: confirm). Any non-ASCII-valid
# values are mapped to the ? character
parsed = bytearray(ans['oentries'])
for i in xrange(0, 2*ans['oentries'], 2):
if 0x20 < raw[i+1] < 0x7f:
parsed[i//2] = raw[i+1]
else:
parsed[i//2] = ord(b'?')
ans['ordt_map'] = bytes(parsed).decode('ascii')
else:
ans['ordt_map'] = '?'*ans['oentries']
return ans
class CNCX(object): # {{{
@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
return ans
def parse_index_record(table, data, control_byte_count, tags, codec,
strict=False):
ordt_map, strict=False):
header = parse_indx_header(data)
idxt_pos = header['start']
if data[idxt_pos:idxt_pos+4] != b'IDXT':
@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
for j in xrange(entry_count):
start, end = idx_positions[j:j+2]
rec = data[start:end]
ident, consumed = decode_string(rec, codec=codec)
ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
rec = rec[consumed:]
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
table[ident] = tag_map
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len']
tag_section_start = indx_header['tagx']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count):
# Index record
data = sections[i][0]
parse_index_record(table, data, control_byte_count, tags, codec)
parse_index_record(table, data, control_byte_count, tags, codec,
indx_header['ordt_map'])
return table, cncx

View File

@ -285,7 +285,11 @@ class Mobi8Reader(object):
def create_guide(self):
guide = Guide()
for ref_type, ref_title, fileno in self.guide:
elem = self.elems[fileno]
try:
elem = self.elems[fileno]
except IndexError:
# Happens for thumbnailstandard in Amazon book samples
continue
fi = self.get_file_info(elem.insert_pos)
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
linktgt = fi.filename

View File

@ -15,10 +15,12 @@ from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
def decode_string(raw, codec='utf-8', ordt_map=''):
    """Decode a length-prefixed string from the start of ``raw``.

    The first byte of ``raw`` gives the length of the string payload that
    immediately follows it.

    :param raw: bytestring starting with the one-byte length prefix.
    :param codec: encoding used to decode the payload when ``ordt_map`` is
        empty.
    :param ordt_map: optional lookup string; when non-empty, every payload
        byte is used as an index into it and ``codec`` is ignored.
    :return: ``(text, consumed)`` where ``consumed`` is the declared
        payload length plus one for the prefix byte.
    :raises struct.error: if ``raw`` is empty.
    :raises IndexError: if a payload byte has no entry in ``ordt_map``.
    """
    # Slice rather than index so the prefix is a bytestring on both
    # Python 2 and Python 3 (indexing bytes yields an int on Python 3).
    length, = struct.unpack(b'>B', raw[0:1])
    payload = raw[1:1+length]
    consumed = length + 1
    if ordt_map:
        # bytearray yields integer byte values on every Python version,
        # so no ord() call is needed.
        return ''.join(ordt_map[x] for x in bytearray(payload)), consumed
    return payload.decode(codec), consumed
def decode_hex_number(raw, codec='utf-8'):