mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up detection of character encoding when declaration is absent by using a native code implementation (cchardet)
This commit is contained in:
parent
36ffc2b3cf
commit
1e6702fbc1
@ -679,6 +679,15 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "cchardet",
|
||||
"unix": {
|
||||
"filename": "cchardet-2.1.7.tar.gz",
|
||||
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
|
||||
"urls": ["pypi"]
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "msgpack",
|
||||
"unix": {
|
||||
|
@ -5,5 +5,5 @@
|
||||
|
||||
set -xe
|
||||
useradd -m ci
|
||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip
|
||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet
|
||||
chown -R ci:users $GITHUB_WORKSPACE
|
||||
|
@ -102,13 +102,21 @@ def substitute_entites(raw):
|
||||
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
||||
|
||||
|
||||
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
|
||||
"x-sjis" : "shift-jis"}
|
||||
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
|
||||
|
||||
|
||||
def detect(*args, **kwargs):
|
||||
from chardet import detect
|
||||
return detect(*args, **kwargs)
|
||||
def detect(bytestring):
    """Guess the character encoding of ``bytestring``.

    The native-code ``cchardet`` detector is used when it can be imported;
    otherwise the ``chardet`` package is used instead.

    Returns the mapping produced by the selected backend. When its
    ``'encoding'`` entry is non-empty it is lower-cased, so callers get the
    same spelling of an encoding name whichever backend ran.
    """
    try:
        from cchardet import detect as implementation
    except ImportError:
        from chardet import detect as implementation
    ans = implementation(bytestring)
    enc = ans.get('encoding')
    if enc:
        # Normalise for both backends: previously only the cchardet path
        # lower-cased the name, so results differed with the fallback.
        ans['encoding'] = enc.lower()
    return ans
|
||||
|
||||
|
||||
def force_encoding(raw, verbose, assume_utf8=False):
|
||||
|
@ -69,10 +69,10 @@ class BuildTest(unittest.TestCase):
|
||||
del CHMFile, chmlib
|
||||
|
||||
def test_chardet(self):
|
||||
from chardet import detect
|
||||
from cchardet import detect
|
||||
raw = 'mūsi Füße'.encode('utf-8')
|
||||
data = detect(raw)
|
||||
self.assertEqual(data['encoding'], 'utf-8')
|
||||
self.assertEqual(data['encoding'].lower(), 'utf-8')
|
||||
self.assertGreater(data['confidence'], 0.5)
|
||||
# The following is used by html5lib
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
Loading…
x
Reference in New Issue
Block a user