diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py
index 50f1409ea6..046b362ae7 100644
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@@ -63,7 +63,6 @@ class TXTInput(InputFormatPlugin):
                 normalize_line_endings, convert_textile, remove_indents,
                 block_to_single_line, separate_hard_scene_breaks)
 
-        self.log = log
         txt = ''
         log.debug('Reading text from file...')
 
@@ -92,6 +91,13 @@ class TXTInput(InputFormatPlugin):
                 log.debug('Using user specified input encoding of %s' % ienc)
         else:
             det_encoding = detect(txt)
+            if (det_encoding['encoding'] or '').lower().replace('_', '-').strip() in (
+                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+                # Microsoft Word exports to HTML with encoding incorrectly set to
+                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+                # The result is indexed by 'encoding'/'confidence' below, so patch that entry in place.
+                det_encoding['encoding'] = 'gbk'
             ienc = det_encoding['encoding']
             log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
         if not ienc: