KF8 Input: Support KF8 format Amazon samples. Fixes #963418 (UnicodeDecodeError: invalid start byte when displaying a KF8 ebook)

parent aba3f4686f
commit 23f9bdc7c9
@@ -39,10 +39,43 @@ def parse_indx_header(data):
     words = (
             'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
             'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
-    )
+    ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
+            'ordt1', 'ordt2', 'tagx')
     num = len(words)
     values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
-    return dict(zip(words, values))
+    ans = dict(zip(words, values))
+
+    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
+    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
+    ans['ordt_map'] = ''
+
+    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
+        # I don't know what this is, but using it seems to be unnecessary, so
+        # just leave it as the raw bytestring
+        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
+    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
+        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
+        if ans['code'] == 65002:
+            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
+            # bothered to write a decoder for this (see
+            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
+            # Instead I use a half-assed decoder that decodes only the
+            # ASCII-valid values correctly. Hopefully these ORDT sections will
+            # only ever be used in SKEL and ELEM indices where the text is
+            # pure ASCII, where EBCDIC-UTF and ASCII agree; any non-ASCII
+            # values are mapped to the ? character.
+            parsed = bytearray(ans['oentries'])
+            for i in xrange(0, 2*ans['oentries'], 2):
+                if 0x20 < raw[i+1] < 0x7f:
+                    parsed[i//2] = raw[i+1]
+                else:
+                    parsed[i//2] = ord(b'?')
+            ans['ordt_map'] = bytes(parsed).decode('ascii')
+        else:
+            ans['ordt_map'] = '?'*ans['oentries']
+
+    return ans
 
 
 class CNCX(object): # {{{
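For context: the fallback decoder added above treats each ORDT2 entry as a 2-byte value whose low byte is, for the entries we care about, a plain ASCII character. A minimal Python 3 sketch of that loop on made-up input (the real code runs under calibre's Python 2, hence xrange):

    raw = bytearray(b'\x00A\x00b\x00\x07')  # three hypothetical 2-byte ORDT2 entries
    oentries = len(raw) // 2
    parsed = bytearray(oentries)
    for i in range(0, 2 * oentries, 2):
        # keep printable ASCII from the low byte, map everything else to '?'
        if 0x20 < raw[i + 1] < 0x7f:
            parsed[i // 2] = raw[i + 1]
        else:
            parsed[i // 2] = ord('?')
    print(bytes(parsed).decode('ascii'))  # -> Ab?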
@@ -163,7 +196,7 @@ def get_tag_map(control_byte_count, tagx, data, strict=False):
     return ans
 
 def parse_index_record(table, data, control_byte_count, tags, codec,
-        strict=False):
+        ordt_map, strict=False):
     header = parse_indx_header(data)
     idxt_pos = header['start']
     if data[idxt_pos:idxt_pos+4] != b'IDXT':
@@ -184,12 +217,11 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
     for j in xrange(entry_count):
         start, end = idx_positions[j:j+2]
         rec = data[start:end]
-        ident, consumed = decode_string(rec, codec=codec)
+        ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
         rec = rec[consumed:]
         tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
         table[ident] = tag_map
-
 
 
 def read_index(sections, idx, codec):
     table, cncx = OrderedDict(), CNCX([], codec)
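The only change in this hunk is threading ordt_map through to decode_string; the surrounding loop is untouched. As a reminder of what it does: consecutive IDXT offsets delimit the index entries, so slicing the position list pairwise yields each record's byte range. A small self-contained sketch with invented offsets:

    idx_positions = [0, 12, 30, 47]        # made-up offsets; the last is the end sentinel
    entry_count = len(idx_positions) - 1
    for j in range(entry_count):
        start, end = idx_positions[j:j+2]  # [start, end) of one index entry
        print(start, end)                  # 0 12 / 12 30 / 30 47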
@@ -203,12 +235,13 @@ def read_index(sections, idx, codec):
         cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
         cncx = CNCX(cncx_records, codec)
 
-    tag_section_start = indx_header['len']
+    tag_section_start = indx_header['tagx']
     control_byte_count, tags = parse_tagx_section(data[tag_section_start:])
 
     for i in xrange(idx + 1, idx + 1 + indx_count):
        # Index record
        data = sections[i][0]
-        parse_index_record(table, data, control_byte_count, tags, codec)
+        parse_index_record(table, data, control_byte_count, tags, codec,
+                indx_header['ordt_map'])
     return table, cncx
 
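The one-line fix above stops assuming the TAGX section starts right after the fixed-length header (indx_header['len']) and instead uses the explicit 'tagx' offset that the widened word list in parse_indx_header now captures. A hedged sketch, with an abbreviated and hypothetical field list, of how those named words come out of the header:

    import struct

    # Hypothetical header values: 'len' is 192, but TAGX actually begins at
    # offset 208, which is what the 'tagx' word records. Field order here is
    # illustrative only.
    words = ('len', 'nul1', 'type', 'tagx')
    data = b'INDX' + struct.pack('>4L', 192, 0, 2, 208)
    values = struct.unpack('>%dL' % len(words), data[4:4*(len(words)+1)])
    indx_header = dict(zip(words, values))
    print(indx_header['tagx'])  # 208 -- not the same as indx_header['len']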
@@ -285,7 +285,11 @@ class Mobi8Reader(object):
     def create_guide(self):
         guide = Guide()
         for ref_type, ref_title, fileno in self.guide:
-            elem = self.elems[fileno]
+            try:
+                elem = self.elems[fileno]
+            except IndexError:
+                # Happens for thumbnailstandard in Amazon book samples
+                continue
             fi = self.get_file_info(elem.insert_pos)
             idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
             linktgt = fi.filename
@@ -15,10 +15,12 @@ from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
 
-def decode_string(raw, codec='utf-8'):
+def decode_string(raw, codec='utf-8', ordt_map=''):
     length, = struct.unpack(b'>B', raw[0])
     raw = raw[1:1+length]
     consumed = length+1
+    if ordt_map:
+        return ''.join(ordt_map[ord(x)] for x in raw), consumed
     return raw.decode(codec), consumed
 
 def decode_hex_number(raw, codec='utf-8'):
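To show how the new ordt_map parameter behaves: when a map is present, each byte of the length-prefixed string is used as an index into the ORDT-derived translation table instead of being decoded with the codec. A Python 3 re-sketch with a made-up table (the diff itself is Python 2, hence struct.unpack on raw[0]):

    def decode_string(raw, codec='utf-8', ordt_map=''):
        length = raw[0]                  # first byte is the byte count
        body = raw[1:1+length]
        consumed = length + 1
        if ordt_map:
            # each byte indexes into the ORDT translation table
            return ''.join(ordt_map[b] for b in body), consumed
        return body.decode(codec), consumed

    ordt_map = '?' * 0x30 + '0123456789'  # hypothetical map: bytes 0x30..0x39 -> digits
    print(decode_string(b'\x03\x30\x31\x32', ordt_map=ordt_map))  # ('012', 4)
    print(decode_string(b'\x03abc'))                              # ('abc', 4)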
|
Loading…
x
Reference in New Issue
Block a user