Use latin-1 instead of utf-8 for default encoding.

This commit is contained in:
John Schember 2011-04-17 10:03:50 -04:00
parent 93492a9ec8
commit 87bb34d994

View File

@@ -208,7 +208,7 @@ class SectionMetadata(object):
''' '''
def __init__(self, raw): def __init__(self, raw):
self.default_encoding = 'utf-8' self.default_encoding = 'latin-1'
self.exceptional_uid_encodings = {} self.exceptional_uid_encodings = {}
self.owner_id = None self.owner_id = None
@@ -222,14 +222,14 @@ class SectionMetadata(object):
# CharSet # CharSet
if type == 1: if type == 1:
val, = struct.unpack('>H', raw[6+adv:8+adv]) val, = struct.unpack('>H', raw[6+adv:8+adv])
self.default_encoding = MIBNUM_TO_NAME.get(val, 'utf-8') self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
# ExceptionalCharSets # ExceptionalCharSets
elif type == 2: elif type == 2:
ii_adv = 0 ii_adv = 0
for ii in xrange(length / 2): for ii in xrange(length / 2):
uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv]) uid, = struct.unpack('>H', raw[6+adv+ii_adv:8+adv+ii_adv])
mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv]) mib, = struct.unpack('>H', raw[8+adv+ii_adv:10+adv+ii_adv])
self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'utf-8') self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1')
ii_adv += 4 ii_adv += 4
# OwnerID # OwnerID
elif type == 3: elif type == 3:
@@ -306,7 +306,7 @@ class Reader(FormatReader):
self.uid_image_section_number = {} self.uid_image_section_number = {}
self.uid_composite_image_section_number = {} self.uid_composite_image_section_number = {}
self.metadata_section_number = None self.metadata_section_number = None
self.default_encoding = 'utf-8' self.default_encoding = 'latin-1'
self.owner_id = None self.owner_id = None
self.sections = [] self.sections = []
@@ -680,10 +680,12 @@ class Reader(FormatReader):
# 3 Bytes # 3 Bytes
# alternate text length, 16-bit unicode character # alternate text length, 16-bit unicode character
elif c == 0x83: elif c == 0x83:
#offset += 2 #offset += 1
#alt_len = struct.unpack('>B', str(d[offset]))[0]
#offset += 1
#c16 = d[offset:offset+2] #c16 = d[offset:offset+2]
#html += c16.decode('utf-16') #html += c16.decode('utf-16')
#offset += 1 #offset += 1 + alt_len
offset += 3 offset += 3
# 32-bit Unicode character # 32-bit Unicode character
# 5 Bytes # 5 Bytes