KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError invalid start byte when displaying KF8 ebook)

This commit is contained in:
Kovid Goyal 2012-03-26 13:56:12 +05:30
parent aba3f4686f
commit 23f9bdc7c9
3 changed files with 48 additions and 9 deletions

View File

def parse_indx_header(data):
    """Parse the fixed-size header of an INDX record.

    data -- raw bytes of an INDX section (starts with the b'INDX' magic).

    Returns a dict mapping header field names to their 32-bit big-endian
    values, augmented with three derived entries:
      * 'ordt1_raw' -- raw bytes of the first ORDT table ([] if absent)
      * 'ordt2_raw' -- bytearray of the second ORDT table ([] if absent)
      * 'ordt_map'  -- unicode table mapping byte values to characters,
                       consumed by decode_string() for KF8 Amazon samples
                       whose index strings are ORDT-remapped ('' if absent)
    """
    # NOTE(review): the diff hunk starts mid-function; upstream may validate
    # the b'INDX' signature before this point -- confirm against full source.
    words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    ) + tuple('unknown%d' % i for i in range(27)) + ('ocnt', 'oentries',
            'ordt1', 'ordt2', 'tagx')
    num = len(words)
    # 45 big-endian unsigned 32-bit fields follow the 4-byte magic.
    # b'...' % num works on both py2 and py3, unlike bytes('...').
    values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)])
    ans = dict(zip(words, values))
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''
    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
        # Purpose of this first table is unknown, and using it seems to be
        # unnecessary, so just keep it as the raw bytestring.
        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
        if ans['code'] == 65002:
            # This appears to be UTF-EBCDIC (codepage 65002) encoded. A full
            # decoder (see http://www.unicode.org/reports/tr16/) is not worth
            # writing: these ORDT sections should only occur in SKEL and ELEM
            # indices whose text is pure ASCII, and UTF-EBCDIC shares the
            # single-byte encodings of printable ASCII. So decode only the
            # printable-ASCII low bytes and map everything else to '?'.
            parsed = bytearray(ans['oentries'])
            for i in range(0, 2*ans['oentries'], 2):
                # Each entry is 2 bytes; the low byte carries the ASCII value.
                if 0x20 < raw[i+1] < 0x7f:
                    parsed[i//2] = raw[i+1]
                else:
                    parsed[i//2] = ord(b'?')
            ans['ordt_map'] = bytes(parsed).decode('ascii')
        else:
            # Unknown encoding: degrade to all-'?' rather than crash.
            ans['ordt_map'] = '?'*ans['oentries']
    return ans
class CNCX(object): # {{{ class CNCX(object): # {{{
@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
return ans return ans
def parse_index_record(table, data, control_byte_count, tags, codec, def parse_index_record(table, data, control_byte_count, tags, codec,
strict=False): ordt_map, strict=False):
header = parse_indx_header(data) header = parse_indx_header(data)
idxt_pos = header['start'] idxt_pos = header['start']
if data[idxt_pos:idxt_pos+4] != b'IDXT': if data[idxt_pos:idxt_pos+4] != b'IDXT':
@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
for j in xrange(entry_count): for j in xrange(entry_count):
start, end = idx_positions[j:j+2] start, end = idx_positions[j:j+2]
rec = data[start:end] rec = data[start:end]
ident, consumed = decode_string(rec, codec=codec) ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
rec = rec[consumed:] rec = rec[consumed:]
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
table[ident] = tag_map table[ident] = tag_map
def read_index(sections, idx, codec): def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec) table, cncx = OrderedDict(), CNCX([], codec)
@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]] cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
cncx = CNCX(cncx_records, codec) cncx = CNCX(cncx_records, codec)
tag_section_start = indx_header['len'] tag_section_start = indx_header['tagx']
control_byte_count, tags = parse_tagx_section(data[tag_section_start:]) control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
for i in xrange(idx + 1, idx + 1 + indx_count): for i in xrange(idx + 1, idx + 1 + indx_count):
# Index record # Index record
data = sections[i][0] data = sections[i][0]
parse_index_record(table, data, control_byte_count, tags, codec) parse_index_record(table, data, control_byte_count, tags, codec,
indx_header['ordt_map'])
return table, cncx return table, cncx

View File

@ -285,7 +285,11 @@ class Mobi8Reader(object):
def create_guide(self): def create_guide(self):
guide = Guide() guide = Guide()
for ref_type, ref_title, fileno in self.guide: for ref_type, ref_title, fileno in self.guide:
elem = self.elems[fileno] try:
elem = self.elems[fileno]
except IndexError:
# Happens for thumbnailstandard in Amazon book samples
continue
fi = self.get_file_info(elem.insert_pos) fi = self.get_file_info(elem.insert_pos)
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec) idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
linktgt = fi.filename linktgt = fi.filename

View File

@ -15,10 +15,12 @@ from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024 IMAGE_MAX_SIZE = 10 * 1024 * 1024
def decode_string(raw, codec='utf-8', ordt_map=''):
    """Decode a length-prefixed string from an index record.

    raw      -- bytes whose first byte is the string length, followed by at
                least that many bytes of string data.
    codec    -- character encoding used when no ORDT remapping applies.
    ordt_map -- optional remap table (parse_indx_header()['ordt_map']): a
                unicode string indexed by byte value. When non-empty it takes
                precedence over codec (KF8 Amazon samples store ORDT-encoded
                identifiers that are not valid UTF-8, hence the
                UnicodeDecodeError this path fixes).

    Returns (decoded unicode string, number of bytes consumed).
    """
    # raw[0:1] (not raw[0]) so struct always receives a bytestring,
    # including on py3 where raw[0] would be an int.
    length, = struct.unpack(b'>B', raw[0:1])
    raw = raw[1:1+length]
    consumed = length + 1
    if ordt_map:
        # bytearray() yields ints on both py2 and py3, avoiding ord().
        return ''.join(ordt_map[b] for b in bytearray(raw)), consumed
    return raw.decode(codec), consumed
def decode_hex_number(raw, codec='utf-8'): def decode_hex_number(raw, codec='utf-8'):