From 672d91d454b26f4cbd3c265b6dff15879a4be5da Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 Apr 2011 10:40:58 -0600 Subject: [PATCH] Fix #745428 (Calibre doesn't convert encoding correctly.) --- src/calibre/ebooks/chardet/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index c562176ef2..604cbdd360 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -100,6 +100,12 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, try: if encoding.lower().strip() == 'macintosh': encoding = 'mac-roman' + if encoding.lower().replace('_', '-').strip() in ( + 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', + 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'): + # Microsoft Word exports to HTML with encoding incorrectly set to + # gb2312 instead of gbk. gbk is a superset of gb2312, anyway. + encoding = 'gbk' raw = raw.decode(encoding, 'replace') except LookupError: encoding = 'utf-8' @@ -110,11 +116,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, if resolve_entities: raw = substitute_entites(raw) - if encoding and encoding.lower().replace('_', '-').strip() in ( - 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', - 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'): - # Microsoft Word exports to HTML with encoding incorrectly set to - # gb2312 instead of gbk. gbk is a superset of gb2312, anyway. - encoding = 'gbk' + return raw, encoding