This commit is contained in:
Kovid Goyal 2021-03-05 19:38:18 +05:30
parent f7d80b53e6
commit 5d126692f8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, codecs import re, codecs, sys
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
_encoding_pats = ( _encoding_pats = (
@@ -106,32 +106,30 @@ _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
def detect(bytestring): def detect(bytestring):
try: from cchardet import detect as implementation
from cchardet import detect as implementation ans = implementation(bytestring)
except ImportError: enc = ans.get('encoding')
from chardet import detect as implementation if enc:
return implementation(bytestring) ans['encoding'] = enc.lower()
else: elif enc is None:
ans = implementation(bytestring) ans['encoding'] = ''
enc = ans.get('encoding') if ans.get('confidence') is None:
if enc: ans['confidence'] = 0
ans['encoding'] = enc.lower() return ans
return ans
def force_encoding(raw, verbose, assume_utf8=False): def force_encoding(raw, verbose, assume_utf8=False):
from calibre.constants import preferred_encoding from calibre.constants import preferred_encoding
try: try:
chardet = detect(raw[:1024*50]) chardet = detect(raw[:1024*50])
except: except Exception:
chardet = {'encoding':preferred_encoding, 'confidence':0} chardet = {'encoding':preferred_encoding, 'confidence':0}
encoding = chardet['encoding'] encoding = chardet['encoding']
if chardet['confidence'] < 1 and assume_utf8: if chardet['confidence'] < 1:
encoding = 'utf-8' if verbose:
if chardet['confidence'] < 1 and verbose: print(f'WARNING: Encoding detection confidence for {chardet["encoding"]} is {chardet["confidence"]}', file=sys.stderr)
print('WARNING: Encoding detection confidence for %s is %d%%'%( if assume_utf8:
chardet['encoding'], chardet['confidence']*100)) encoding = 'utf-8'
if not encoding: if not encoding:
encoding = preferred_encoding encoding = preferred_encoding
encoding = encoding.lower() encoding = encoding.lower()