mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError invalid start byte when displaying KF8 ebook)
This commit is contained in:
parent
aba3f4686f
commit
23f9bdc7c9
@ -39,10 +39,43 @@ def parse_indx_header(data):
|
||||
words = (
|
||||
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||
'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
|
||||
)
|
||||
) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
|
||||
'ordt1', 'ordt2', 'tagx')
|
||||
num = len(words)
|
||||
values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
|
||||
return dict(zip(words, values))
|
||||
ans = dict(zip(words, values))
|
||||
ordt1, ordt2 = ans['ordt1'], ans['ordt2']
|
||||
ans['ordt1_raw'], ans['ordt2_raw'] = [], []
|
||||
ans['ordt_map'] = ''
|
||||
|
||||
if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
|
||||
# I don't know what this is, but using it seems to be unnecessary, so
|
||||
# just leave it as the raw bytestring
|
||||
ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
|
||||
if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
|
||||
ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
|
||||
if ans['code'] == 65002:
|
||||
# This appears to be EBCDIC-UTF (65002) encoded. I can't be
|
||||
# bothered to write a decoder for this (see
|
||||
# http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
|
||||
# Instead I use a half-assed decoder that decodes only the ASCII
|
||||
# valid values correctly. Hopefully these ORDT sections will only
|
||||
# ever be used in SKEL and ELEM indices where the text is pure
|
||||
# ASCII. EBCDIC-UTF and ASCII are assumed to have the same values there. Any non ASCII valid
|
||||
# values are mapped to the ? character
|
||||
|
||||
parsed = bytearray(ans['oentries'])
|
||||
for i in xrange(0, 2*ans['oentries'], 2):
|
||||
if 0x20 < raw[i+1] < 0x7f:
|
||||
parsed[i//2] = raw[i+1]
|
||||
else:
|
||||
parsed[i//2] = ord(b'?')
|
||||
ans['ordt_map'] = bytes(parsed).decode('ascii')
|
||||
else:
|
||||
ans['ordt_map'] = '?'*ans['oentries']
|
||||
|
||||
return ans
|
||||
|
||||
|
||||
class CNCX(object): # {{{
|
||||
|
||||
@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
|
||||
return ans
|
||||
|
||||
def parse_index_record(table, data, control_byte_count, tags, codec,
|
||||
strict=False):
|
||||
ordt_map, strict=False):
|
||||
header = parse_indx_header(data)
|
||||
idxt_pos = header['start']
|
||||
if data[idxt_pos:idxt_pos+4] != b'IDXT':
|
||||
@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
|
||||
for j in xrange(entry_count):
|
||||
start, end = idx_positions[j:j+2]
|
||||
rec = data[start:end]
|
||||
ident, consumed = decode_string(rec, codec=codec)
|
||||
ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
|
||||
rec = rec[consumed:]
|
||||
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
|
||||
table[ident] = tag_map
|
||||
|
||||
|
||||
def read_index(sections, idx, codec):
|
||||
table, cncx = OrderedDict(), CNCX([], codec)
|
||||
|
||||
@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
|
||||
cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
|
||||
cncx = CNCX(cncx_records, codec)
|
||||
|
||||
tag_section_start = indx_header['len']
|
||||
tag_section_start = indx_header['tagx']
|
||||
control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
|
||||
|
||||
for i in xrange(idx + 1, idx + 1 + indx_count):
|
||||
# Index record
|
||||
data = sections[i][0]
|
||||
parse_index_record(table, data, control_byte_count, tags, codec)
|
||||
parse_index_record(table, data, control_byte_count, tags, codec,
|
||||
indx_header['ordt_map'])
|
||||
return table, cncx
|
||||
|
||||
|
@ -285,7 +285,11 @@ class Mobi8Reader(object):
|
||||
def create_guide(self):
|
||||
guide = Guide()
|
||||
for ref_type, ref_title, fileno in self.guide:
|
||||
elem = self.elems[fileno]
|
||||
try:
|
||||
elem = self.elems[fileno]
|
||||
except IndexError:
|
||||
# Happens for thumbnailstandard in Amazon book samples
|
||||
continue
|
||||
fi = self.get_file_info(elem.insert_pos)
|
||||
idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
|
||||
linktgt = fi.filename
|
||||
|
@ -15,10 +15,12 @@ from calibre.ebooks import normalize
|
||||
|
||||
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
||||
|
||||
def decode_string(raw, codec='utf-8'):
|
||||
def decode_string(raw, codec='utf-8', ordt_map=''):
|
||||
length, = struct.unpack(b'>B', raw[0])
|
||||
raw = raw[1:1+length]
|
||||
consumed = length+1
|
||||
if ordt_map:
|
||||
return ''.join(ordt_map[ord(x)] for x in raw), consumed
|
||||
return raw.decode(codec), consumed
|
||||
|
||||
def decode_hex_number(raw, codec='utf-8'):
|
||||
|
Loading…
x
Reference in New Issue
Block a user