Speed up detection of character encoding when a declaration is absent by using a native code implementation (cchardet)

This commit is contained in:
Kovid Goyal 2021-02-25 15:02:48 +05:30
parent 36ffc2b3cf
commit 1e6702fbc1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 25 additions and 8 deletions

View File

@ -679,6 +679,15 @@
}
},
{
"name": "cchardet",
"unix": {
"filename": "cchardet-2.1.7.tar.gz",
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
"urls": ["pypi"]
}
},
{
"name": "msgpack",
"unix": {

View File

@ -5,5 +5,5 @@
set -xe
useradd -m ci
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet
chown -R ci:users $GITHUB_WORKSPACE

View File

@ -102,13 +102,21 @@ def substitute_entites(raw):
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
# Charset name aliases: each key is an alternate label and each value is the
# name presumably used in its place (the lookup site is not visible here).
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
"x-sjis" : "shift-jis"}
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
def detect(*args, **kwargs):
    """Forward all arguments to ``chardet.detect`` and return its result.

    ``chardet`` is imported at call time rather than when this module is
    imported.
    """
    from chardet import detect as chardet_detect
    result = chardet_detect(*args, **kwargs)
    return result
def detect(bytestring):
    """Guess the character encoding of ``bytestring``.

    The ``cchardet`` implementation is tried first; if it cannot be imported,
    ``chardet`` is used instead. Both are imported at call time.

    Returns the dict produced by the underlying ``detect`` call. When
    ``cchardet`` produced it, a truthy ``'encoding'`` value is lower-cased in
    place before returning. The ``chardet`` result is returned unmodified.
    """
    try:
        from cchardet import detect as native_detect
    except ImportError:
        # Fallback path: hand back chardet's answer exactly as it reports it.
        from chardet import detect as fallback_detect
        return fallback_detect(bytestring)

    result = native_detect(bytestring)
    name = result.get('encoding')
    if name:
        # Only normalise when an encoding was actually reported.
        result['encoding'] = name.lower()
    return result
def force_encoding(raw, verbose, assume_utf8=False):

View File

@ -69,10 +69,10 @@ class BuildTest(unittest.TestCase):
del CHMFile, chmlib
def test_chardet(self):
from chardet import detect
from cchardet import detect
raw = 'mūsi Füße'.encode('utf-8')
data = detect(raw)
self.assertEqual(data['encoding'], 'utf-8')
self.assertEqual(data['encoding'].lower(), 'utf-8')
self.assertGreater(data['confidence'], 0.5)
# The following is used by html5lib
from chardet.universaldetector import UniversalDetector