TXT Input: When detecting the encoding of TXT files, use only the first four kilobytes of text. This fixes excessively slow conversion of very large text files. See #1668246 (Txt file to mobi file, more than 19M dead loop).

This commit is contained in:
Kovid Goyal 2017-03-08 09:53:34 +05:30
parent 3a515c1db6
commit c6eaede439

View File

@@ -110,7 +110,7 @@ class TXTInput(InputFormatPlugin):
ienc = options.input_encoding ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc) log.debug('Using user specified input encoding of %s' % ienc)
else: else:
det_encoding = detect(txt) det_encoding = detect(txt[:4096])
det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence'] det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
if det_encoding and det_encoding.lower().replace('_', '-').strip() in ( if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',