diff --git a/bypy/linux/__main__.py b/bypy/linux/__main__.py index 863b31357e..bad81744bf 100644 --- a/bypy/linux/__main__.py +++ b/bypy/linux/__main__.py @@ -45,7 +45,7 @@ def binary_includes(): get_dll_path, ('usb-1.0 mtp expat sqlite3 ffi z lzma openjp2 poppler dbus-1 iconv xml2 xslt jpeg png16' ' webp webpmux webpdemux exslt ncursesw readline chm hunspell-1.7 hyphen' - ' icudata icui18n icuuc icuio stemmer gcrypt gpg-error' + ' icudata icui18n icuuc icuio stemmer gcrypt gpg-error uchardet' ' gobject-2.0 glib-2.0 gthread-2.0 gmodule-2.0 gio-2.0 dbus-glib-1').split() )) + [ # debian/ubuntu for for some typical stupid reason use libpcre.so.3 diff --git a/bypy/macos/__main__.py b/bypy/macos/__main__.py index f80fbbf7a2..b922ce7b58 100644 --- a/bypy/macos/__main__.py +++ b/bypy/macos/__main__.py @@ -527,7 +527,7 @@ class Freeze: def add_misc_libraries(self): for x in ( 'usb-1.0.0', 'mtp.9', 'chm.0', 'sqlite3.0', 'hunspell-1.7.0', - 'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', + 'icudata.70', 'icui18n.70', 'icuio.70', 'icuuc.70', 'hyphen.0', 'uchardet.0', 'stemmer.0', 'xslt.1', 'exslt.0', 'xml2.2', 'z.1', 'unrar', 'lzma.5', 'crypto.1.1', 'ssl.1.1', 'iconv.2', # 'ltdl.7' ): diff --git a/bypy/sources.json b/bypy/sources.json index 217ba2c90c..6e3a8fd00e 100644 --- a/bypy/sources.json +++ b/bypy/sources.json @@ -700,11 +700,11 @@ }, { - "name": "cchardet", + "name": "uchardet", "unix": { - "filename": "cchardet-2.1.7.tar.gz", - "hash": "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf", - "urls": ["pypi"] + "filename": "uchardet-0.0.7.tar.xz", + "hash": "sha256:3fc79408ae1d84b406922fa9319ce005631c95ca0f34b205fad867e8b30e45b1", + "urls": ["https://www.freedesktop.org/software/uchardet/releases/{filename}"] } }, diff --git a/setup/arch-ci.sh b/setup/arch-ci.sh index 0c45ddac5e..c390e6fbbb 100755 --- a/setup/arch-ci.sh +++ b/setup/arch-ci.sh @@ -5,7 +5,7 @@ set -xe -pacman -S --noconfirm --needed base-devel sudo git sip 
pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip python-cchardet libstemmer poppler +pacman -S --noconfirm --needed base-devel sudo git sip pyqt-builder cmake chmlib icu jxrlib hunspell libmtp libusb libwmf optipng podofo python-apsw python-beautifulsoup4 python-cssselect python-css-parser python-dateutil python-jeepney python-dnspython python-feedparser python-html2text python-html5-parser python-lxml python-markdown python-mechanize python-msgpack python-netifaces python-unrardll python-pillow python-psutil python-pygments python-pyqt6 python-regex python-zeroconf python-pyqt6-webengine qt6-svg qt6-imageformats udisks2 hyphen python-pychm python-pycryptodome speech-dispatcher python-sphinx python-urllib3 python-py7zr python-pip uchardet libstemmer poppler useradd -m ci chown -R ci:users $GITHUB_WORKSPACE diff --git a/setup/build_environment.py b/setup/build_environment.py index ad5f77c82d..eb0a87a7c6 100644 --- a/setup/build_environment.py +++ b/setup/build_environment.py @@ -130,6 +130,7 @@ hunspell_inc_dirs = [] hunspell_lib_dirs = [] hyphen_inc_dirs = [] hyphen_lib_dirs = [] +uchardet_inc_dirs, uchardet_lib_dirs, uchardet_libs = [], [], ['uchardet'] openssl_inc_dirs, openssl_lib_dirs = [], [] ICU = sw = '' @@ -143,6 +144,8 @@ if iswindows: hyphen_lib_dirs = [sw_lib_dir] openssl_inc_dirs = [sw_inc_dir] openssl_lib_dirs = [sw_lib_dir] + uchardet_inc_dirs = [sw_inc_dir] + uchardet_lib_dirs = [sw_lib_dir] sqlite_inc_dirs = 
[sw_inc_dir] chmlib_inc_dirs = [sw_inc_dir] chmlib_lib_dirs = [sw_lib_dir] @@ -165,6 +168,7 @@ elif ismacos: podofo_lib = sw_lib_dir ft_libs = ['freetype'] ft_inc_dirs = [sw + '/include/freetype2'] + uchardet_inc_dirs = [sw + '/include/uchardet'] SSL = os.environ.get('OPENSSL_DIR', os.path.join(sw, 'private', 'ssl')) openssl_inc_dirs = [os.path.join(SSL, 'include')] openssl_lib_dirs = [os.path.join(SSL, 'lib')] @@ -183,6 +187,9 @@ else: if not os.path.exists(podofo_inc + '/podofo.h'): podofo_inc = os.path.join(sw, 'include', 'podofo') podofo_lib = os.path.join(sw, 'lib') + uchardet_inc_dirs = pkgconfig_include_dirs('uchardet', '', '/usr/include/uchardet') + uchardet_lib_dirs = pkgconfig_lib_dirs('uchardet', '', '/usr/lib') + uchardet_libs = pkgconfig_libs('uchardet', '', '') podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) diff --git a/setup/extensions.json b/setup/extensions.json index 29625fd870..ec8987ad9c 100644 --- a/setup/extensions.json +++ b/setup/extensions.json @@ -16,6 +16,13 @@ "lib_dirs": "!hyphen_lib_dirs", "needs_c99": true }, + { + "name": "uchardet", + "sources": "calibre/ebooks/uchardet.c", + "libraries": "!uchardet_libs", + "inc_dirs": "!uchardet_inc_dirs", + "lib_dirs": "!uchardet_lib_dirs" + }, { "name": "unicode_names", "headers": "unicode_names/names.h unicode_names/data-types.h", diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 6eebc5467c..1a3301e625 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -266,6 +266,7 @@ class ExtensionsImporter: 'tokenizer', 'certgen', 'sqlite_extension', + 'uchardet', ) if iswindows: extra = ('winutil', 'wpd', 'winfonts', 'winsapi') diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 53fe6c5108..4aeceea0fd 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -103,16 +103,18 @@ _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"} def detect(bytestring): - from cchardet import detect as 
implementation - ans = implementation(bytestring) - enc = ans.get('encoding') - if enc: - ans['encoding'] = enc.lower() - elif enc is None: - ans['encoding'] = '' - if ans.get('confidence') is None: - ans['confidence'] = 0 - return ans + if isinstance(bytestring, str): + bytestring = bytestring.encode('utf-8', 'replace') + try: + from calibre_extensions.uchardet import detect as implementation + except ImportError: + # People running from source without updated binaries + from cchardet import detect as cdi + + def implementation(x): + return cdi(x).get('encoding') or '' + enc = implementation(bytestring).lower() + return {'encoding': enc, 'confidence': 1 if enc else 0} def force_encoding(raw, verbose, assume_utf8=False): diff --git a/src/calibre/ebooks/uchardet.c b/src/calibre/ebooks/uchardet.c new file mode 100644 index 0000000000..fe5895c840 --- /dev/null +++ b/src/calibre/ebooks/uchardet.c @@ -0,0 +1,65 @@ +/* + * uchardet.c + * Copyright (C) 2022 Kovid Goyal + * + * Distributed under terms of the GPL3 license. 
+ */ + +#include "Python.h" +#include <uchardet.h> + +#define CAPSULE_NAME "uchardet.detector_capsule" +#define CAPSULE_ATTR "detector_capsule" + +static PyObject* +detect(PyObject *self, PyObject *bytes) { /* METH_O: self is the module, bytes is the single argument */ + if (!PyBytes_Check(bytes)) { PyErr_SetString(PyExc_TypeError, "a byte string is required"); return NULL; } + PyObject *capsule = PyObject_GetAttrString(self, CAPSULE_ATTR); /* new reference, released below */ + if (!capsule) return NULL; + void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME); + if (!d) { Py_DECREF(capsule); return NULL; } + uchardet_reset(d); /* the one detector created in exec_module is reused for every call */ + uchardet_handle_data(d, PyBytes_AS_STRING(bytes), (size_t)PyBytes_GET_SIZE(bytes)); + uchardet_data_end(d); + PyObject *ans = PyUnicode_FromString(uchardet_get_charset(d)); Py_DECREF(capsule); return ans; /* a NULL ans propagates the error */ +} + +static PyMethodDef methods[] = { + {"detect", detect, METH_O, + "detect(bytestring) -> encoding name\n\n" + "Detect the encoding of the specified bytestring" + }, + {NULL, NULL, 0, NULL} +}; + + +static void +free_detector(PyObject *capsule) { /* capsule destructor: releases the detector allocated in exec_module */ + void *d = PyCapsule_GetPointer(capsule, CAPSULE_NAME); + if (d) uchardet_delete(d); +} + +static int +exec_module(PyObject *module) { /* Py_mod_exec slot: creates the detector and stores it on the module as CAPSULE_ATTR */ + uchardet_t detector = uchardet_new(); + if (!detector) { PyErr_NoMemory(); return -1; } + PyObject *detector_capsule = PyCapsule_New(detector, CAPSULE_NAME, free_detector); + if (!detector_capsule) { uchardet_delete(detector); return -1; } /* no capsule owns the detector yet, so free it here */ + int ret = PyModule_AddObjectRef(module, CAPSULE_ATTR, detector_capsule); + Py_DECREF(detector_capsule); /* PyModule_AddObjectRef does not steal the reference */ + return ret; +} + +static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} }; + +static struct PyModuleDef module_def = { + .m_base = PyModuleDef_HEAD_INIT, + .m_name = "uchardet", + .m_doc = "Detect the encoding of bytestring", + .m_methods = methods, + .m_slots = slots, +}; + +CALIBRE_MODINIT_FUNC PyInit_uchardet(void) { + return PyModuleDef_Init(&module_def); +} diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py index e8364ab9ee..f6bc4b05af 100644 --- a/src/calibre/test_build.py +++ b/src/calibre/test_build.py @@ -73,11 +73,10 @@ class BuildTest(unittest.TestCase): del CHMFile, chmlib
def test_chardet(self): - from cchardet import detect + from calibre_extensions.uchardet import detect raw = 'mūsi Füße'.encode() - data = detect(raw) - self.assertEqual(data['encoding'].lower(), 'utf-8') - self.assertGreater(data['confidence'], 0.5) + enc = detect(raw).lower() + self.assertEqual(enc, 'utf-8') # The following is used by html5lib from chardet.universaldetector import UniversalDetector detector = UniversalDetector()