mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Switch from cchardet to uchardet
cchardet is not maintained anymore (https://github.com/PyYoshi/cChardet/issues/77). cchardet is based on uchardet, with the addition of reporting encoding detection confidence. We don't really need that, so moving to uchardet is simplest. See #1690 (Low effort port to charset_normalizer).
This commit is contained in:
parent
1e62ba9542
commit
5c3385476f
@ -45,7 +45,7 @@ def binary_includes():
|
|||||||
get_dll_path,
|
get_dll_path,
|
||||||
('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16'
|
('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16'
|
||||||
' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen'
|
' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen'
|
||||||
' icudata icui18n icuuc icuio stemmer gcrypt gpg-error'
|
' icudata icui18n icuuc icuio stemmer gcrypt gpg-error uchardet'
|
||||||
' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split()
|
' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split()
|
||||||
)) + [
|
)) + [
|
||||||
# debian/ubuntu for some typical stupid reason use libpcre.so.3
|
# debian/ubuntu for some typical stupid reason use libpcre.so.3
|
||||||
|
@ -527,7 +527,7 @@ class Freeze:
|
|||||||
def add_misc_libraries(self):
|
def add_misc_libraries(self):
|
||||||
for x in (
|
for x in (
|
||||||
'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0',
|
'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0',
|
||||||
'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0',
|
'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', 'uchardet.0',
|
||||||
'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5',
|
'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5',
|
||||||
'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7'
|
'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7'
|
||||||
):
|
):
|
||||||
|
@ -700,11 +700,11 @@
|
|||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "cchardet",
|
"name": "uchardet",
|
||||||
"unix": {
|
"unix": {
|
||||||
"filename": "cchardet-2.1.7.tar.gz",
|
"filename": "uchardet-0.0.7.tar.xz",
|
||||||
"hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf",
|
"hash": "sha256:3fc79408ae1d84b406922fa9319ce005631c95ca0f34b205fad867e8b30e45b1",
|
||||||
"urls": ["pypi"]
|
"urls": ["https://www.freedesktop.org/software/uchardet/releases/{filename}"]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
set -xe
|
set -xe
|
||||||
|
|
||||||
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet libstemmer poppler
|
pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip uchardet libstemmer poppler
|
||||||
|
|
||||||
useradd -m ci
|
useradd -m ci
|
||||||
chown -R ci:users $GITHUB_WORKSPACE
|
chown -R ci:users $GITHUB_WORKSPACE
|
||||||
|
@ -130,6 +130,7 @@ hunspell_inc_dirs = []
|
|||||||
hunspell_lib_dirs = []
|
hunspell_lib_dirs = []
|
||||||
hyphen_inc_dirs = []
|
hyphen_inc_dirs = []
|
||||||
hyphen_lib_dirs = []
|
hyphen_lib_dirs = []
|
||||||
|
uchardet_inc_dirs, uchardet_lib_dirs, uchardet_libs = [], [], ['uchardet']
|
||||||
openssl_inc_dirs, openssl_lib_dirs = [], []
|
openssl_inc_dirs, openssl_lib_dirs = [], []
|
||||||
ICU = sw = ''
|
ICU = sw = ''
|
||||||
|
|
||||||
@ -143,6 +144,8 @@ if iswindows:
|
|||||||
hyphen_lib_dirs = [sw_lib_dir]
|
hyphen_lib_dirs = [sw_lib_dir]
|
||||||
openssl_inc_dirs = [sw_inc_dir]
|
openssl_inc_dirs = [sw_inc_dir]
|
||||||
openssl_lib_dirs = [sw_lib_dir]
|
openssl_lib_dirs = [sw_lib_dir]
|
||||||
|
uchardet_inc_dirs = [sw_inc_dir]
|
||||||
|
uchardet_lib_dirs = [sw_lib_dir]
|
||||||
sqlite_inc_dirs = [sw_inc_dir]
|
sqlite_inc_dirs = [sw_inc_dir]
|
||||||
chmlib_inc_dirs = [sw_inc_dir]
|
chmlib_inc_dirs = [sw_inc_dir]
|
||||||
chmlib_lib_dirs = [sw_lib_dir]
|
chmlib_lib_dirs = [sw_lib_dir]
|
||||||
@ -165,6 +168,7 @@ elif ismacos:
|
|||||||
podofo_lib = sw_lib_dir
|
podofo_lib = sw_lib_dir
|
||||||
ft_libs = ['freetype']
|
ft_libs = ['freetype']
|
||||||
ft_inc_dirs = [sw + '/include/freetype2']
|
ft_inc_dirs = [sw + '/include/freetype2']
|
||||||
|
uchardet_inc_dirs = [sw + '/include/uchardet']
|
||||||
SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl'))
|
SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl'))
|
||||||
openssl_inc_dirs = [os.path.join(SSL, 'include')]
|
openssl_inc_dirs = [os.path.join(SSL, 'include')]
|
||||||
openssl_lib_dirs = [os.path.join(SSL, 'lib')]
|
openssl_lib_dirs = [os.path.join(SSL, 'lib')]
|
||||||
@ -183,6 +187,9 @@ else:
|
|||||||
if not os.path.exists(podofo_inc + '/podofo.h'):
|
if not os.path.exists(podofo_inc + '/podofo.h'):
|
||||||
podofo_inc = os.path.join(sw, 'include', 'podofo')
|
podofo_inc = os.path.join(sw, 'include', 'podofo')
|
||||||
podofo_lib = os.path.join(sw, 'lib')
|
podofo_lib = os.path.join(sw, 'lib')
|
||||||
|
uchardet_inc_dirs = pkgconfig_include_dirs('uchardet', '', '/usr/include/uchardet')
|
||||||
|
uchardet_lib_dirs = pkgconfig_lib_dirs('uchardet', '', '/usr/lib')
|
||||||
|
uchardet_libs = pkgconfig_libs('uchardet', '', '')
|
||||||
|
|
||||||
|
|
||||||
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
||||||
|
@ -16,6 +16,13 @@
|
|||||||
"lib_dirs": "!hyphen_lib_dirs",
|
"lib_dirs": "!hyphen_lib_dirs",
|
||||||
"needs_c99": true
|
"needs_c99": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "uchardet",
|
||||||
|
"sources": "calibre/ebooks/uchardet.c",
|
||||||
|
"libraries": "!uchardet_libs",
|
||||||
|
"inc_dirs": "!uchardet_inc_dirs",
|
||||||
|
"lib_dirs": "!uchardet_lib_dirs"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "unicode_names",
|
"name": "unicode_names",
|
||||||
"headers": "unicode_names/names.h unicode_names/data-types.h",
|
"headers": "unicode_names/names.h unicode_names/data-types.h",
|
||||||
|
@ -266,6 +266,7 @@ class ExtensionsImporter:
|
|||||||
'tokenizer',
|
'tokenizer',
|
||||||
'certgen',
|
'certgen',
|
||||||
'sqlite_extension',
|
'sqlite_extension',
|
||||||
|
'uchardet',
|
||||||
)
|
)
|
||||||
if iswindows:
|
if iswindows:
|
||||||
extra = ('winutil', 'wpd', 'winfonts', 'winsapi')
|
extra = ('winutil', 'wpd', 'winfonts', 'winsapi')
|
||||||
|
@ -103,16 +103,18 @@ _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
|
|||||||
|
|
||||||
|
|
||||||
def detect(bytestring):
|
def detect(bytestring):
|
||||||
from cchardet import detect as implementation
|
if isinstance(bytestring, str):
|
||||||
ans = implementation(bytestring)
|
bytestring = bytestring.encode('utf-8', 'replace')
|
||||||
enc = ans.get('encoding')
|
try:
|
||||||
if enc:
|
from calibre_extensions.uchardet import detect as implementation
|
||||||
ans['encoding'] = enc.lower()
|
except ImportError:
|
||||||
elif enc is None:
|
# People running from source without updated binaries
|
||||||
ans['encoding'] = ''
|
from cchardet import detect as cdi
|
||||||
if ans.get('confidence') is None:
|
|
||||||
ans['confidence'] = 0
|
def implementation(x):
|
||||||
return ans
|
return cdi(x).get('encoding') or ''
|
||||||
|
enc = implementation(bytestring).lower()
|
||||||
|
return {'encoding': enc, 'confidence': 1 if enc else 0}
|
||||||
|
|
||||||
|
|
||||||
def force_encoding(raw, verbose, assume_utf8=False):
|
def force_encoding(raw, verbose, assume_utf8=False):
|
||||||
|
65
src/calibre/ebooks/uchardet.c
Normal file
65
src/calibre/ebooks/uchardet.c
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
* uchardet.c
|
||||||
|
* Copyright (C) 2022 Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
*
|
||||||
|
* Distributed under terms of the GPL3 license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "Python.h"
|
||||||
|
#include <uchardet.h>
|
||||||
|
|
||||||
|
#define CAPSULE_NAME "uchardet.detector_capsule"
|
||||||
|
#define CAPSULE_ATTR "detector_capsule"
|
||||||
|
|
||||||
|
/*
 * detect(bytestring) -> encoding name
 *
 * Feed the given bytes object to the detector stored on the module (the
 * capsule attribute created in exec_module()) and return the charset name
 * reported by uchardet_get_charset() as a Python str.
 *
 * Returns NULL with an exception set if the argument is not a bytes object
 * (TypeError), or if the capsule attribute cannot be looked up or unwrapped.
 */
static PyObject*
detect(PyObject *self, PyObject *bytes) {
    if (!PyBytes_Check(bytes)) { PyErr_SetString(PyExc_TypeError, "a byte string is required"); return NULL; }
    /* PyObject_GetAttrString() returns a new reference. It has to be released
     * on every exit path, otherwise each call leaks a reference to the capsule. */
    PyObject *capsule = PyObject_GetAttrString(self, CAPSULE_ATTR);
    if (!capsule) return NULL;
    PyObject *ans = NULL;
    /* On failure PyCapsule_GetPointer() returns NULL with an exception set,
     * in which case ans stays NULL and the error propagates to the caller. */
    void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
    if (d) {
        /* The same detector is reused for every call, so discard any state
         * left over from the previous detection first. */
        uchardet_reset(d);
        uchardet_handle_data(d, PyBytes_AS_STRING(bytes), (size_t)PyBytes_GET_SIZE(bytes));
        uchardet_data_end(d);
        ans = PyUnicode_FromString(uchardet_get_charset(d));
    }
    /* Only drop our reference once we are done using the detector */
    Py_DECREF(capsule);
    return ans;
}
|
||||||
|
|
||||||
|
static PyMethodDef methods[] = {
|
||||||
|
{"detect", detect, METH_O,
|
||||||
|
"detect(bytestring) -> encoding name\n\n"
|
||||||
|
"Detect the encoding of the specified bytestring"
|
||||||
|
},
|
||||||
|
{NULL, NULL, 0, NULL}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
free_detector(PyObject *capsule) {
|
||||||
|
void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
|
||||||
|
if (d) uchardet_delete(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
exec_module(PyObject *module) {
|
||||||
|
uchardet_t detector = uchardet_new();
|
||||||
|
if (!detector) { PyErr_NoMemory(); return -1; }
|
||||||
|
PyObject *detector_capsule = PyCapsule_New(detector, CAPSULE_NAME, free_detector);
|
||||||
|
if (!detector_capsule) return -1;
|
||||||
|
int ret = PyModule_AddObjectRef(module, CAPSULE_ATTR, detector_capsule);
|
||||||
|
Py_DECREF(detector_capsule);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} };
|
||||||
|
|
||||||
|
static struct PyModuleDef module_def = {
|
||||||
|
.m_base = PyModuleDef_HEAD_INIT,
|
||||||
|
.m_name = "uchardet",
|
||||||
|
.m_doc = "Detect the encoding of bytestring",
|
||||||
|
.m_methods = methods,
|
||||||
|
.m_slots = slots,
|
||||||
|
};
|
||||||
|
|
||||||
|
CALIBRE_MODINIT_FUNC PyInit_uchardet(void) {
|
||||||
|
return PyModuleDef_Init(&module_def);
|
||||||
|
}
|
@ -73,11 +73,10 @@ class BuildTest(unittest.TestCase):
|
|||||||
del CHMFile, chmlib
|
del CHMFile, chmlib
|
||||||
|
|
||||||
def test_chardet(self):
|
def test_chardet(self):
|
||||||
from cchardet import detect
|
from calibre_extensions.uchardet import detect
|
||||||
raw = 'mūsi Füße'.encode()
|
raw = 'mūsi Füße'.encode()
|
||||||
data = detect(raw)
|
enc = detect(raw).lower()
|
||||||
self.assertEqual(data['encoding'].lower(), 'utf-8')
|
self.assertEqual(enc, 'utf-8')
|
||||||
self.assertGreater(data['confidence'], 0.5)
|
|
||||||
# The following is used by html5lib
|
# The following is used by html5lib
|
||||||
from chardet.universaldetector import UniversalDetector
|
from chardet.universaldetector import UniversalDetector
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user