mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Switch from cchardet to uchardet
cchardet is not maintained anymore (see https://github.com/PyYoshi/cChardet/issues/77). cchardet is based on uchardet, with the addition of reporting encoding detection confidence. We don't really need that, so moving to uchardet is simplest. See #1690 (Low effort port to charset_normalizer).
This commit is contained in:
parent
1e62ba9542
commit
5c3385476f
@ -45,7 +45,7 @@ def binary_includes():
|
||||
get_dll_path,
|
||||
('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16'
|
||||
' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen'
|
||||
' icudata icui18n icuuc icuio stemmer gcrypt gpg-error'
|
||||
' icudata icui18n icuuc icuio stemmer gcrypt gpg-error uchardet'
|
||||
' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split()
|
||||
)) + [
|
||||
# debian/ubuntu for for some typical stupid reason use libpcre.so.3
|
||||
|
@ -527,7 +527,7 @@ class Freeze:
|
||||
def add_misc_libraries(self):
|
||||
for x in (
|
||||
'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0',
|
||||
'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0',
|
||||
'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', 'uchardet.0',
|
||||
'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5',
|
||||
'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7'
|
||||
):
|
||||
|
@ -700,11 +700,11 @@
|
||||
},
|
||||
|
||||
{
|
||||
"name": "cchardet",
|
||||
"name": "uchardet",
|
||||
"unix": {
|
||||
"filename": "cchardet-2.1.7.tar.gz",
|
||||
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
|
||||
"urls": ["pypi"]
|
||||
"filename": "uchardet-0.0.7.tar.xz",
|
||||
"hash": "sha256:3fc79408ae1d84b406922fa9319ce005631c95ca0f34b205fad867e8b30e45b1",
|
||||
"urls": ["https://www.freedesktop.org/software/uchardet/releases/{filename}"]
|
||||
}
|
||||
},
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
set -xe
|
||||
|
||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet libstemmer poppler
|
||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip uchardet libstemmer poppler
|
||||
|
||||
useradd -m ci
|
||||
chown -R ci:users $GITHUB_WORKSPACE
|
||||
|
@ -130,6 +130,7 @@ hunspell_inc_dirs = []
|
||||
hunspell_lib_dirs = []
|
||||
hyphen_inc_dirs = []
|
||||
hyphen_lib_dirs = []
|
||||
uchardet_inc_dirs, uchardet_lib_dirs, uchardet_libs = [], [], ['uchardet']
|
||||
openssl_inc_dirs, openssl_lib_dirs = [], []
|
||||
ICU = sw = ''
|
||||
|
||||
@ -143,6 +144,8 @@ if iswindows:
|
||||
hyphen_lib_dirs = [sw_lib_dir]
|
||||
openssl_inc_dirs = [sw_inc_dir]
|
||||
openssl_lib_dirs = [sw_lib_dir]
|
||||
uchardet_inc_dirs = [sw_inc_dir]
|
||||
uchardet_lib_dirs = [sw_lib_dir]
|
||||
sqlite_inc_dirs = [sw_inc_dir]
|
||||
chmlib_inc_dirs = [sw_inc_dir]
|
||||
chmlib_lib_dirs = [sw_lib_dir]
|
||||
@ -165,6 +168,7 @@ elif ismacos:
|
||||
podofo_lib = sw_lib_dir
|
||||
ft_libs = ['freetype']
|
||||
ft_inc_dirs = [sw + '/include/freetype2']
|
||||
uchardet_inc_dirs = [sw + '/include/uchardet']
|
||||
SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl'))
|
||||
openssl_inc_dirs = [os.path.join(SSL, 'include')]
|
||||
openssl_lib_dirs = [os.path.join(SSL, 'lib')]
|
||||
@ -183,6 +187,9 @@ else:
|
||||
if not os.path.exists(podofo_inc + '/podofo.h'):
|
||||
podofo_inc = os.path.join(sw, 'include', 'podofo')
|
||||
podofo_lib = os.path.join(sw, 'lib')
|
||||
uchardet_inc_dirs = pkgconfig_include_dirs('uchardet', '', '/usr/include/uchardet')
|
||||
uchardet_lib_dirs = pkgconfig_lib_dirs('uchardet', '', '/usr/lib')
|
||||
uchardet_libs = pkgconfig_libs('uchardet', '', '')
|
||||
|
||||
|
||||
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
||||
|
@ -16,6 +16,13 @@
|
||||
"lib_dirs": "!hyphen_lib_dirs",
|
||||
"needs_c99": true
|
||||
},
|
||||
{
|
||||
"name": "uchardet",
|
||||
"sources": "calibre/ebooks/uchardet.c",
|
||||
"libraries": "!uchardet_libs",
|
||||
"inc_dirs": "!uchardet_inc_dirs",
|
||||
"lib_dirs": "!uchardet_lib_dirs"
|
||||
},
|
||||
{
|
||||
"name": "unicode_names",
|
||||
"headers": "unicode_names/names.h unicode_names/data-types.h",
|
||||
|
@ -266,6 +266,7 @@ class ExtensionsImporter:
|
||||
'tokenizer',
|
||||
'certgen',
|
||||
'sqlite_extension',
|
||||
'uchardet',
|
||||
)
|
||||
if iswindows:
|
||||
extra = ('winutil', 'wpd', 'winfonts', 'winsapi')
|
||||
|
@ -103,16 +103,18 @@ _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
|
||||
|
||||
|
||||
def detect(bytestring):
|
||||
from cchardet import detect as implementation
|
||||
ans = implementation(bytestring)
|
||||
enc = ans.get('encoding')
|
||||
if enc:
|
||||
ans['encoding'] = enc.lower()
|
||||
elif enc is None:
|
||||
ans['encoding'] = ''
|
||||
if ans.get('confidence') is None:
|
||||
ans['confidence'] = 0
|
||||
return ans
|
||||
if isinstance(bytestring, str):
|
||||
bytestring = bytestring.encode('utf-8', 'replace')
|
||||
try:
|
||||
from calibre_extensions.uchardet import detect as implementation
|
||||
except ImportError:
|
||||
# People running from source without updated binaries
|
||||
from cchardet import detect as cdi
|
||||
|
||||
def implementation(x):
|
||||
return cdi(x).get('encoding') or ''
|
||||
enc = implementation(bytestring).lower()
|
||||
return {'encoding': enc, 'confidence': 1 if enc else 0}
|
||||
|
||||
|
||||
def force_encoding(raw, verbose, assume_utf8=False):
|
||||
|
65
src/calibre/ebooks/uchardet.c
Normal file
65
src/calibre/ebooks/uchardet.c
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* uchardet.c
|
||||
* Copyright (C) 2022 Kovid Goyal <kovid at kovidgoyal.net>
|
||||
*
|
||||
* Distributed under terms of the GPL3 license.
|
||||
*/
|
||||
|
||||
#include "Python.h"
|
||||
#include <uchardet.h>
|
||||
|
||||
#define CAPSULE_NAME "uchardet.detector_capsule"
|
||||
#define CAPSULE_ATTR "detector_capsule"
|
||||
|
||||
/*
 * detect(bytestring) -> str
 *
 * Feed the contents of a Python bytes object to the module's shared uchardet
 * detector and return the charset name it reports as a Python str.
 *
 * self  -- the module object; the detector lives in a capsule stored on it
 *          under CAPSULE_ATTR.
 * bytes -- must be a bytes object, otherwise TypeError is raised.
 *
 * Returns a new reference, or NULL with an exception set on failure.
 */
static PyObject*
detect(PyObject *self, PyObject *bytes) {
    if (!PyBytes_Check(bytes)) {
        PyErr_SetString(PyExc_TypeError, "a byte string is required");
        return NULL;
    }

    /* PyObject_GetAttrString() returns a NEW reference; it must be released
     * on every exit path below or one reference leaks per call. */
    PyObject *capsule = PyObject_GetAttrString(self, CAPSULE_ATTR);
    if (!capsule) return NULL;

    void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
    if (!d) {
        Py_DECREF(capsule);
        return NULL;
    }

    /* The detector is reused across calls, so clear any previous state. */
    uchardet_reset(d);
    uchardet_handle_data(d, PyBytes_AS_STRING(bytes), (size_t)PyBytes_GET_SIZE(bytes));
    uchardet_data_end(d);

    PyObject *ans = PyUnicode_FromString(uchardet_get_charset(d));
    /* Release the capsule only after the last use of the detector pointer. */
    Py_DECREF(capsule);
    return ans;
}
|
||||
|
||||
static PyMethodDef methods[] = {
|
||||
{"detect", detect, METH_O,
|
||||
"detect(bytestring) -> encoding name\n\n"
|
||||
"Detect the encoding of the specified bytestring"
|
||||
},
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
free_detector(PyObject *capsule) {
|
||||
void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
|
||||
if (d) uchardet_delete(d);
|
||||
}
|
||||
|
||||
static int
|
||||
exec_module(PyObject *module) {
|
||||
uchardet_t detector = uchardet_new();
|
||||
if (!detector) { PyErr_NoMemory(); return -1; }
|
||||
PyObject *detector_capsule = PyCapsule_New(detector, CAPSULE_NAME, free_detector);
|
||||
if (!detector_capsule) return -1;
|
||||
int ret = PyModule_AddObjectRef(module, CAPSULE_ATTR, detector_capsule);
|
||||
Py_DECREF(detector_capsule);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Multi-phase initialisation slots: run exec_module() when the module executes. */
static PyModuleDef_Slot slots[] = {
    {Py_mod_exec, exec_module},
    {0, NULL},
};
|
||||
|
||||
static struct PyModuleDef module_def = {
|
||||
.m_base = PyModuleDef_HEAD_INIT,
|
||||
.m_name = "uchardet",
|
||||
.m_doc = "Detect the encoding of bytestring",
|
||||
.m_methods = methods,
|
||||
.m_slots = slots,
|
||||
};
|
||||
|
||||
CALIBRE_MODINIT_FUNC PyInit_uchardet(void) {
|
||||
return PyModuleDef_Init(&module_def);
|
||||
}
|
@ -73,11 +73,10 @@ class BuildTest(unittest.TestCase):
|
||||
del CHMFile, chmlib
|
||||
|
||||
def test_chardet(self):
|
||||
from cchardet import detect
|
||||
from calibre_extensions.uchardet import detect
|
||||
raw = 'mūsi Füße'.encode()
|
||||
data = detect(raw)
|
||||
self.assertEqual(data['encoding'].lower(), 'utf-8')
|
||||
self.assertGreater(data['confidence'], 0.5)
|
||||
enc = detect(raw).lower()
|
||||
self.assertEqual(enc, 'utf-8')
|
||||
# The following is used by html5lib
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
detector = UniversalDetector()
|
||||
|
Loading…
x
Reference in New Issue
Block a user