Handle spurious encoding match when detecting encoding for diff

This commit is contained in:
Kovid Goyal 2014-12-14 12:35:53 +05:30
parent 2c9ca9ea7e
commit cf01a5b969

View File

@@ -108,9 +108,11 @@ def get_decoded_raw(name):
if syntax in {'html', 'xml'}: if syntax in {'html', 'xml'}:
raw = xml_to_unicode(raw, verbose=True)[0] raw = xml_to_unicode(raw, verbose=True)[0]
else: else:
m = re.search(r"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I) m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
if m is not None and m.group(1) != '8bit': if m is not None and m.group(1) != '8bit':
enc = m.group(1) enc = m.group(1)
if enc == b'unicode':
enc = 'utf-8'
else: else:
enc = force_encoding(raw, verbose=True) enc = force_encoding(raw, verbose=True)
try: try: