Fix #745428 (Calibre doesn't convert encoding correctly.)

This commit is contained in:
Kovid Goyal 2011-04-02 10:40:58 -06:00
parent 777c5ec251
commit 672d91d454

View File

@@ -100,6 +100,12 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
try: try:
if encoding.lower().strip() == 'macintosh': if encoding.lower().strip() == 'macintosh':
encoding = 'mac-roman' encoding = 'mac-roman'
if encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
encoding = 'gbk'
raw = raw.decode(encoding, 'replace') raw = raw.decode(encoding, 'replace')
except LookupError: except LookupError:
encoding = 'utf-8' encoding = 'utf-8'
@@ -110,11 +116,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if resolve_entities: if resolve_entities:
raw = substitute_entites(raw) raw = substitute_entites(raw)
if encoding and encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
encoding = 'gbk'
return raw, encoding return raw, encoding