Add a function to efficiently convert a unicode string into a tuple of unicode code points

2025-07-08 18:54:09 -04:00 · 2015-01-21 18:35:22 +05:30 · 2015-01-21 18:35:22 +05:30 · 95e36e66e7
commit 95e36e66e7
parent 9fa83858a3
5 changed files with 69 additions and 7 deletions
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -968,18 +968,34 @@ icu_chr(PyObject *self, PyObject *args) {
    UChar32 code = 0;
    UChar buf[5] = {0};
    int32_t sz = 0;
-    char utf8[21];
-    PyObject *result = NULL;
  
    if (!PyArg_ParseTuple(args, "I", &code)) return NULL;

    u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    result = PyUnicode_DecodeUTF8(utf8, sz, "strict");
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); return NULL; }
+    return icu_to_python(buf, sz);
+} // }}}
+
+// ord_string {{{
+/*
+ * ord_string(string) -> tuple of integers, one per unicode code point in string.
+ *
+ * input is converted with python_to_icu32() (called with type checking
+ * enabled) and every element of the resulting UChar32 buffer becomes one
+ * python int in the returned tuple.
+ *
+ * Returns a new reference to the tuple, or NULL with a python exception set
+ * if the conversion or any allocation fails.
+ */
+static PyObject *
+icu_ord_string(PyObject *self, PyObject *input) {
+    UChar32 *input_buf = NULL;
+    int32_t sz = 0, i = 0;
+    PyObject *ans = NULL, *temp = NULL;
+
+    input_buf = python_to_icu32(input, &sz, 1);
+    if (input_buf == NULL) goto end;
+    ans = PyTuple_New(sz);
+    if (ans == NULL) goto end;
+    for (i = 0; i < sz; i++) {
+        temp = PyInt_FromLong((long)input_buf[i]);
+        // PyInt_FromLong has already set the exception, do not overwrite it
+        if (temp == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
+        PyTuple_SET_ITEM(ans, i, temp);  // steals the reference to temp
+    }
 end:
-    return result;
+    free(input_buf);  // free(NULL) is a no-op, so no guard is needed
+    return ans;
 } // }}}

 // normalize {{{
@ -1130,6 +1146,10 @@ static PyMethodDef icu_methods[] = {
     "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
    },

+    {"ord_string", icu_ord_string, METH_O, 
+     "ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
+    },
+
    {"normalize", icu_normalize, METH_VARARGS, 
     "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
    },
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -231,6 +231,12 @@ primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator

 safe_chr = _icu.chr

+# Use the compiled implementation when the extension provides it. People
+# running from source may have an older build without ord_string, in which
+# case a pure python equivalent is used instead.
+ord_string = getattr(_icu, 'ord_string', lambda x: tuple(map(ord, x)))
+
 def character_name(string):
    try:
        return _icu.character_name(unicode(string)) or None
--- a/src/calibre/utils/icu_calibre_utils.h
+++ b/src/calibre/utils/icu_calibre_utils.h
@ -59,6 +59,38 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
 end:
    return ans;
 }
+
+#ifndef NO_PYTHON_TO_ICU32
+/*
+ * Convert a python unicode object into a newly allocated, zero terminated
+ * array of UChar32 code points.
+ *
+ * obj:      the python object to convert.
+ * osz:      if not NULL, receives the number of code points stored in the
+ *           returned array (the terminator is not counted).
+ * do_check: if non-zero, obj must be exactly a unicode object, otherwise a
+ *           TypeError is set and NULL is returned.
+ *
+ * Returns NULL with a python exception set on failure. On success the caller
+ * owns the returned buffer and must release it with free().
+ */
+static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
+    UChar32 *ans = NULL;
+    Py_ssize_t sz = 0;
+#ifndef Py_UNICODE_WIDE
+    UErrorCode status = U_ZERO_ERROR;
+#endif
+
+    if (do_check && !PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        goto end;
+    }
+
+    // Number of Py_UNICODE units in obj (not bytes). The output can never
+    // need more UChar32 elements than this, on either kind of build.
+    sz = PyUnicode_GET_SIZE(obj);
+    // Size the buffer in UChar32 elements so that the capacity passed to ICU
+    // below matches the allocation and a full UChar32 terminator fits.
+    ans = (UChar32*) calloc((size_t)sz + 1, sizeof(UChar32));  // Ensure null termination
+    if (ans == NULL) { PyErr_NoMemory(); goto end; }
+
+#ifdef Py_UNICODE_WIDE
+// wide build (UCS 4)
+    memcpy(ans, PyUnicode_AS_UNICODE(obj), (size_t)sz * sizeof(UChar32));
+    if (osz != NULL) *osz = (int32_t)sz;
+#else
+// narrow build (UTF-16)
+    u_strToUTF32(ans, (int32_t)sz + 1, osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
+#endif
+end:
+    return ans;
+}
+#endif
+
 #endif

 #ifndef NO_ICU_TO_PYTHON
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@ -136,6 +136,9 @@ class TestICU(unittest.TestCase):
            self.ae(icu._icu.string_length(x), l)
        for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
            self.ae(icu._icu.utf16_length(x), l)
+        self.ae(icu._icu.chr(0x1f431), '\U0001f431')
+        self.ae(icu._icu.ord_string('abc'), tuple(map(ord, 'abc')))
+        self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))

    def test_character_name(self):
        ' Test character naming '
--- a/src/calibre/utils/matcher.c
+++ b/src/calibre/utils/matcher.c
@ -6,6 +6,7 @@
 */

 #define NO_ICU_TO_PYTHON
+#define NO_PYTHON_TO_ICU32
 #include "icu_calibre_utils.h"
 #include <float.h>