Txt Input: Use the gbk encoding for txt files with detected encoding of gb2312. Fixes #1175974 (Simplified Chinese text file charset detection problem: GBK text files detected as GB2312)

This commit is contained in:
Kovid Goyal 2013-05-03 18:12:19 +05:30
parent 2f7a442c63
commit 8e8efd61a9

View File

@ -63,7 +63,6 @@ class TXTInput(InputFormatPlugin):
normalize_line_endings, convert_textile, remove_indents, normalize_line_endings, convert_textile, remove_indents,
block_to_single_line, separate_hard_scene_breaks) block_to_single_line, separate_hard_scene_breaks)
self.log = log self.log = log
txt = '' txt = ''
log.debug('Reading text from file...') log.debug('Reading text from file...')
@ -92,6 +91,12 @@ class TXTInput(InputFormatPlugin):
log.debug('Using user specified input encoding of %s' % ienc) log.debug('Using user specified input encoding of %s' % ienc)
else: else:
det_encoding = detect(txt) det_encoding = detect(txt)
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# GBK text files are often detected as gb2312 (likewise, Microsoft Word
# exports HTML with the encoding incorrectly set to gb2312 instead of
# gbk). gbk is a superset of gb2312, so decoding as gbk is safe.
det_encoding = 'gbk'
ienc = det_encoding['encoding'] ienc = det_encoding['encoding']
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
if not ienc: if not ienc: