diff --git a/bypy/sources.json b/bypy/sources.json index 6d91dbf282..3767058255 100644 --- a/bypy/sources.json +++ b/bypy/sources.json @@ -679,6 +679,15 @@ } }, + { + "name": "cchardet", + "unix": { + "filename": "cchardet-2.1.7.tar.gz", + "hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf", + "urls": ["pypi"] + } + }, + { "name": "msgpack", "unix": { diff --git a/setup/arch-ci.sh b/setup/arch-ci.sh index d4ac1c7d6f..6954f7b0d9 100755 --- a/setup/arch-ci.sh +++ b/setup/arch-ci.sh @@ -5,5 +5,5 @@ set -xe useradd -m ci -pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip +pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-dbus python-dnspython python-dukpy python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt5 python-regex python-zeroconf python-pyqtwebengine qt5-x11extras qt5-svg qt5-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet chown -R ci:users $GITHUB_WORKSPACE diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 56bbe69de6..54e4acbac2 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -102,13 +102,21 @@ def substitute_entites(raw): return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw) -_CHARSET_ALIASES = {"macintosh" : "mac-roman", - "x-sjis" : "shift-jis"} +_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"} -def detect(*args, **kwargs): - from chardet import detect - return detect(*args, **kwargs) +def detect(bytestring): + try: + from cchardet import detect as implementation + except ImportError: + from chardet import detect as implementation + return implementation(bytestring) + else: + ans = implementation(bytestring) + enc = ans.get('encoding') + if enc: + ans['encoding'] = enc.lower() + return ans def force_encoding(raw, verbose, assume_utf8=False): diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py index b37fb1bcfb..961a86a9dc 100644 --- a/src/calibre/test_build.py +++ b/src/calibre/test_build.py @@ -69,10 +69,10 @@ class BuildTest(unittest.TestCase): del CHMFile, chmlib def test_chardet(self): - from chardet import detect + from cchardet import detect raw = 'mūsi Füße'.encode('utf-8') data = detect(raw) - self.assertEqual(data['encoding'], 'utf-8') + self.assertEqual(data['encoding'].lower(), 'utf-8') self.assertGreater(data['confidence'], 0.5) # The following is used by html5lib from chardet.universaldetector import UniversalDetector