Switch from cchardet to uchardet

cchardet is not maintained anymore: https://github.com/PyYoshi/cChardet/issues/77

cchardet is based on uchardet with the addition of reporting encoding
detection confidence. We don't really need that, so moving to uchardet is
simplest.

See #1690 (Low effort port to charset_normalizer)
This commit is contained in:
Kovid Goyal 2022-07-16 15:04:38 +05:30
parent 1e62ba9542
commit 5c3385476f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
10 changed files with 102 additions and 21 deletions

View File

@ -45,7 +45,7 @@ def binary_includes():
get_dll_path, get_dll_path,
('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16' ('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16'
' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen' ' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen'
' icudata icui18n icuuc icuio stemmer gcrypt gpg-error' ' icudata icui18n icuuc icuio stemmer gcrypt gpg-error uchardet'
' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split() ' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split()
)) + [ )) + [
# debian/ubuntu for for some typical stupid reason use libpcre.so.3 # debian/ubuntu for for some typical stupid reason use libpcre.so.3

View File

@ -527,7 +527,7 @@ class Freeze:
def add_misc_libraries(self): def add_misc_libraries(self):
for x in ( for x in (
'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0', 'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0',
'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', 'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', 'uchardet.0',
'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5', 'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5',
'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7' 'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7'
): ):

View File

@ -700,11 +700,11 @@
}, },
{ {
"name": "cchardet", "name": "uchardet",
"unix": { "unix": {
"filename": "cchardet-2.1.7.tar.gz", "filename": "uchardet-0.0.7.tar.xz",
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf", "hash": "sha256:3fc79408ae1d84b406922fa9319ce005631c95ca0f34b205fad867e8b30e45b1",
"urls": ["pypi"] "urls": ["https://www.freedesktop.org/software/uchardet/releases/{filename}"]
} }
}, },

View File

@ -5,7 +5,7 @@
set -xe set -xe
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet libstemmer poppler pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip uchardet libstemmer poppler
useradd -m ci useradd -m ci
chown -R ci:users $GITHUB_WORKSPACE chown -R ci:users $GITHUB_WORKSPACE

View File

@ -130,6 +130,7 @@ hunspell_inc_dirs = []
hunspell_lib_dirs = [] hunspell_lib_dirs = []
hyphen_inc_dirs = [] hyphen_inc_dirs = []
hyphen_lib_dirs = [] hyphen_lib_dirs = []
uchardet_inc_dirs, uchardet_lib_dirs, uchardet_libs = [], [], ['uchardet']
openssl_inc_dirs, openssl_lib_dirs = [], [] openssl_inc_dirs, openssl_lib_dirs = [], []
ICU = sw = '' ICU = sw = ''
@ -143,6 +144,8 @@ if iswindows:
hyphen_lib_dirs = [sw_lib_dir] hyphen_lib_dirs = [sw_lib_dir]
openssl_inc_dirs = [sw_inc_dir] openssl_inc_dirs = [sw_inc_dir]
openssl_lib_dirs = [sw_lib_dir] openssl_lib_dirs = [sw_lib_dir]
uchardet_inc_dirs = [sw_inc_dir]
uchardet_lib_dirs = [sw_lib_dir]
sqlite_inc_dirs = [sw_inc_dir] sqlite_inc_dirs = [sw_inc_dir]
chmlib_inc_dirs = [sw_inc_dir] chmlib_inc_dirs = [sw_inc_dir]
chmlib_lib_dirs = [sw_lib_dir] chmlib_lib_dirs = [sw_lib_dir]
@ -165,6 +168,7 @@ elif ismacos:
podofo_lib = sw_lib_dir podofo_lib = sw_lib_dir
ft_libs = ['freetype'] ft_libs = ['freetype']
ft_inc_dirs = [sw + '/include/freetype2'] ft_inc_dirs = [sw + '/include/freetype2']
uchardet_inc_dirs = [sw + '/include/uchardet']
SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl')) SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl'))
openssl_inc_dirs = [os.path.join(SSL, 'include')] openssl_inc_dirs = [os.path.join(SSL, 'include')]
openssl_lib_dirs = [os.path.join(SSL, 'lib')] openssl_lib_dirs = [os.path.join(SSL, 'lib')]
@ -183,6 +187,9 @@ else:
if not os.path.exists(podofo_inc + '/podofo.h'): if not os.path.exists(podofo_inc + '/podofo.h'):
podofo_inc = os.path.join(sw, 'include', 'podofo') podofo_inc = os.path.join(sw, 'include', 'podofo')
podofo_lib = os.path.join(sw, 'lib') podofo_lib = os.path.join(sw, 'lib')
uchardet_inc_dirs = pkgconfig_include_dirs('uchardet', '', '/usr/include/uchardet')
uchardet_lib_dirs = pkgconfig_lib_dirs('uchardet', '', '/usr/lib')
uchardet_libs = pkgconfig_libs('uchardet', '', '')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)

View File

@ -16,6 +16,13 @@
"lib_dirs": "!hyphen_lib_dirs", "lib_dirs": "!hyphen_lib_dirs",
"needs_c99": true "needs_c99": true
}, },
{
"name": "uchardet",
"sources": "calibre/ebooks/uchardet.c",
"libraries": "!uchardet_libs",
"inc_dirs": "!uchardet_inc_dirs",
"lib_dirs": "!uchardet_lib_dirs"
},
{ {
"name": "unicode_names", "name": "unicode_names",
"headers": "unicode_names/names.h unicode_names/data-types.h", "headers": "unicode_names/names.h unicode_names/data-types.h",

View File

@ -266,6 +266,7 @@ class ExtensionsImporter:
'tokenizer', 'tokenizer',
'certgen', 'certgen',
'sqlite_extension', 'sqlite_extension',
'uchardet',
) )
if iswindows: if iswindows:
extra = ('winutil', 'wpd', 'winfonts', 'winsapi') extra = ('winutil', 'wpd', 'winfonts', 'winsapi')

View File

@ -103,16 +103,18 @@ _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
def detect(bytestring): def detect(bytestring):
from cchardet import detect as implementation if isinstance(bytestring, str):
ans = implementation(bytestring) bytestring = bytestring.encode('utf-8', 'replace')
enc = ans.get('encoding') try:
if enc: from calibre_extensions.uchardet import detect as implementation
ans['encoding'] = enc.lower() except ImportError:
elif enc is None: # People running from source without updated binaries
ans['encoding'] = '' from cchardet import detect as cdi
if ans.get('confidence') is None:
ans['confidence'] = 0 def implementation(x):
return ans return cdi(x).get('encoding') or ''
enc = implementation(bytestring).lower()
return {'encoding': enc, 'confidence': 1 if enc else 0}
def force_encoding(raw, verbose, assume_utf8=False): def force_encoding(raw, verbose, assume_utf8=False):

View File

@ -0,0 +1,65 @@
/*
* uchardet.c
* Copyright (C) 2022 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#include "Python.h"
#include <uchardet.h>
#define CAPSULE_NAME "uchardet.detector_capsule"
#define CAPSULE_ATTR "detector_capsule"
/*
 * detect(bytestring) -> encoding name
 *
 * Feed the given bytes object to the detector stored in the module's
 * CAPSULE_ATTR capsule and return the charset name reported by uchardet as a
 * Python str.
 *
 * Returns NULL with an exception set if the argument is not a bytes object
 * (TypeError), if the capsule attribute cannot be fetched, if the capsule
 * does not hold a detector, or if the result string cannot be created.
 */
static PyObject*
detect(PyObject *self, PyObject *bytes) {
    if (!PyBytes_Check(bytes)) { PyErr_SetString(PyExc_TypeError, "a byte string is required"); return NULL; }
    /* PyObject_GetAttrString returns a NEW reference; it has to be released
     * on every path below, otherwise each call leaks one reference. */
    PyObject *capsule = PyObject_GetAttrString(self, CAPSULE_ATTR);
    if (!capsule) return NULL;
    PyObject *ans = NULL;
    void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
    if (d) {
        /* The detector is reused between calls, so clear any previous state first */
        uchardet_reset(d);
        uchardet_handle_data(d, PyBytes_AS_STRING(bytes), (size_t)PyBytes_GET_SIZE(bytes));
        uchardet_data_end(d);
        ans = PyUnicode_FromString(uchardet_get_charset(d));
    }
    /* Release the capsule only after the detector pointer is no longer used */
    Py_DECREF(capsule);
    return ans;
}
/* Method table for the module: exposes the single function detect(), which
 * takes exactly one positional argument (METH_O). */
static PyMethodDef methods[] = {
{"detect", detect, METH_O,
"detect(bytestring) -> encoding name\n\n"
"Detect the encoding of the specified bytestring"
},
/* Sentinel entry marking the end of the table */
{NULL, NULL, 0, NULL}
};
/*
 * Capsule destructor: release the uchardet detector stored in the capsule
 * named CAPSULE_NAME. Does nothing if no pointer can be retrieved.
 */
static void
free_detector(PyObject *capsule) {
    void *detector = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
    if (detector == NULL) return;
    uchardet_delete(detector);
}
/*
 * Py_mod_exec slot: create one uchardet detector, wrap it in a capsule whose
 * destructor is free_detector, and store that capsule on the module under
 * CAPSULE_ATTR.
 *
 * Returns 0 on success, -1 with an exception set on failure.
 */
static int
exec_module(PyObject *module) {
    uchardet_t detector = uchardet_new();
    if (!detector) { PyErr_NoMemory(); return -1; }
    PyObject *detector_capsule = PyCapsule_New(detector, CAPSULE_NAME, free_detector);
    if (!detector_capsule) {
        /* No capsule took ownership of the detector, so free_detector will
         * never run for it; release it here to avoid leaking it. */
        uchardet_delete(detector);
        return -1;
    }
    /* PyModule_AddObjectRef does not steal the reference, so drop ours
     * whether or not the call succeeded. */
    int ret = PyModule_AddObjectRef(module, CAPSULE_ATTR, detector_capsule);
    Py_DECREF(detector_capsule);
    return ret;
}
/* Module slots: run exec_module when the module is executed; the {0, NULL}
 * entry terminates the array. */
static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} };
/* Module definition tying together the name, docstring, method table and
 * slots of the "uchardet" extension module. */
static struct PyModuleDef module_def = {
.m_base = PyModuleDef_HEAD_INIT,
.m_name = "uchardet",
.m_doc = "Detect the encoding of bytestring",
.m_methods = methods,
.m_slots = slots,
};
/* Module entry point: hands module_def to PyModuleDef_Init, so the module is
 * set up through the slots declared in module_def (exec_module).
 * NOTE(review): CALIBRE_MODINIT_FUNC is defined outside this file; presumably
 * it supplies the return type and export attributes - confirm. */
CALIBRE_MODINIT_FUNC PyInit_uchardet(void) {
return PyModuleDef_Init(&module_def);
}

View File

@ -73,11 +73,10 @@ class BuildTest(unittest.TestCase):
del CHMFile, chmlib del CHMFile, chmlib
def test_chardet(self): def test_chardet(self):
from cchardet import detect from calibre_extensions.uchardet import detect
raw = 'mūsi Füße'.encode() raw = 'mūsi Füße'.encode()
data = detect(raw) enc = detect(raw).lower()
self.assertEqual(data['encoding'].lower(), 'utf-8') self.assertEqual(enc, 'utf-8')
self.assertGreater(data['confidence'], 0.5)
# The following is used by html5lib # The following is used by html5lib
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
detector = UniversalDetector() detector = UniversalDetector()