Speed up detection of character encoding when the declaration is absent by using a native code implementation (cchardet)

This commit is contained in:
Kovid Goyal 2021-02-25 15:02:48 +05:30
parent 36ffc2b3cf
commit 1e6702fbc1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 25 additions and 8 deletions

View File

@ -679,6 +679,15 @@
} }
}, },
{
"name": "cchardet",
"unix": {
"filename": "cchardet-2.1.7.tar.gz",
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
"urls": ["pypi"]
}
},
{ {
"name": "msgpack", "name": "msgpack",
"unix": { "unix": {

View File

@ -5,5 +5,5 @@
set -xe set -xe
useradd -m ci useradd -m ci
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet
chown -R ci:users $GITHUB_WORKSPACE chown -R ci:users $GITHUB_WORKSPACE

View File

@ -102,13 +102,21 @@ def substitute_entites(raw):
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw) return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
_CHARSET_ALIASES = {"macintosh" : "mac-roman", _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
"x-sjis" : "shift-jis"}
def detect(*args, **kwargs): def detect(bytestring):
from chardet import detect try:
return detect(*args, **kwargs) from cchardet import detect as implementation
except ImportError:
from chardet import detect as implementation
return implementation(bytestring)
else:
ans = implementation(bytestring)
enc = ans.get('encoding')
if enc:
ans['encoding'] = enc.lower()
return ans
def force_encoding(raw, verbose, assume_utf8=False): def force_encoding(raw, verbose, assume_utf8=False):

View File

@ -69,10 +69,10 @@ class BuildTest(unittest.TestCase):
del CHMFile, chmlib del CHMFile, chmlib
def test_chardet(self): def test_chardet(self):
from chardet import detect from cchardet import detect
raw = 'mūsi Füße'.encode('utf-8') raw = 'mūsi Füße'.encode('utf-8')
data = detect(raw) data = detect(raw)
self.assertEqual(data['encoding'], 'utf-8') self.assertEqual(data['encoding'].lower(), 'utf-8')
self.assertGreater(data['confidence'], 0.5) self.assertGreater(data['confidence'], 0.5)
# The following is used by html5lib # The following is used by html5lib
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector