mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up detection of character encoding when declaration is absent by using a native code implementation (cchardet)
This commit is contained in:
parent
36ffc2b3cf
commit
1e6702fbc1
@ -679,6 +679,15 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "cchardet",
|
||||||
|
"unix": {
|
||||||
|
"filename": "cchardet-2.1.7.tar.gz",
|
||||||
|
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
|
||||||
|
"urls": ["pypi"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "msgpack",
|
"name": "msgpack",
|
||||||
"unix": {
|
"unix": {
|
||||||
|
@ -5,5 +5,5 @@
|
|||||||
|
|
||||||
set -xe
|
set -xe
|
||||||
useradd -m ci
|
useradd -m ci
|
||||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip
|
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet
|
||||||
chown -R ci:users $GITHUB_WORKSPACE
|
chown -R ci:users $GITHUB_WORKSPACE
|
||||||
|
@ -102,13 +102,21 @@ def substitute_entites(raw):
|
|||||||
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
||||||
|
|
||||||
|
|
||||||
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
|
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
|
||||||
"x-sjis" : "shift-jis"}
|
|
||||||
|
|
||||||
|
|
||||||
def detect(*args, **kwargs):
|
def detect(bytestring):
|
||||||
from chardet import detect
|
try:
|
||||||
return detect(*args, **kwargs)
|
from cchardet import detect as implementation
|
||||||
|
except ImportError:
|
||||||
|
from chardet import detect as implementation
|
||||||
|
return implementation(bytestring)
|
||||||
|
else:
|
||||||
|
ans = implementation(bytestring)
|
||||||
|
enc = ans.get('encoding')
|
||||||
|
if enc:
|
||||||
|
ans['encoding'] = enc.lower()
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def force_encoding(raw, verbose, assume_utf8=False):
|
def force_encoding(raw, verbose, assume_utf8=False):
|
||||||
|
@ -69,10 +69,10 @@ class BuildTest(unittest.TestCase):
|
|||||||
del CHMFile, chmlib
|
del CHMFile, chmlib
|
||||||
|
|
||||||
def test_chardet(self):
|
def test_chardet(self):
|
||||||
from chardet import detect
|
from cchardet import detect
|
||||||
raw = 'mūsi Füße'.encode('utf-8')
|
raw = 'mūsi Füße'.encode('utf-8')
|
||||||
data = detect(raw)
|
data = detect(raw)
|
||||||
self.assertEqual(data['encoding'], 'utf-8')
|
self.assertEqual(data['encoding'].lower(), 'utf-8')
|
||||||
self.assertGreater(data['confidence'], 0.5)
|
self.assertGreater(data['confidence'], 0.5)
|
||||||
# The following is used by html5lib
|
# The following is used by html5lib
|
||||||
from chardet.universaldetector import UniversalDetector
|
from chardet.universaldetector import UniversalDetector
|
||||||
|
Loading…
x
Reference in New Issue
Block a user