mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Txt Input: Use the gbk encoding for txt files with detected encoding of gb2312. Fixes #1175974 (Simplified Chinese text file charset detection problem: GBK text files detected as GB2312)
This commit is contained in:
parent
2f7a442c63
commit
8e8efd61a9
@@ -63,7 +63,6 @@ class TXTInput(InputFormatPlugin):
|
||||
normalize_line_endings, convert_textile, remove_indents,
|
||||
block_to_single_line, separate_hard_scene_breaks)
|
||||
|
||||
|
||||
self.log = log
|
||||
txt = ''
|
||||
log.debug('Reading text from file...')
|
||||
@@ -92,6 +91,12 @@ class TXTInput(InputFormatPlugin):
|
||||
log.debug('Using user specified input encoding of %s' % ienc)
|
||||
else:
|
||||
det_encoding = detect(txt)
|
||||
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
|
||||
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
|
||||
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
|
||||
# Microsoft Word exports to HTML with encoding incorrectly set to
|
||||
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
|
||||
det_encoding = 'gbk'
|
||||
ienc = det_encoding['encoding']
|
||||
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
|
||||
if not ienc:
|
||||
|
Loading…
x
Reference in New Issue
Block a user