Function to return the number of UTF-16 code units in a Python string on wide Python builds

2026-06-07 14:35:27 -04:00 · 2014-05-12 17:02:57 +05:30
parent fd17470464
commit 7ee75a8775
3 changed files with 27 additions and 5 deletions
@@ -978,10 +978,6 @@ icu_break_iterator_locales(PyObject *self, PyObject *args) {
 // string_length {{{
 static PyObject *
 icu_string_length(PyObject *self, PyObject *args) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
-#endif
-
    int32_t sz = 0;
    UChar *icu = NULL;
    PyObject *src = NULL;
@@ -994,6 +990,20 @@ icu_string_length(PyObject *self, PyObject *args) {
    return Py_BuildValue("i", sz);
 } // }}}

+// utf16_length: Python-callable wrapper returning the length that python_to_icu() reports for its argument {{{
+static PyObject *
+icu_utf16_length(PyObject *self, PyObject *args) {  // args: exactly one positional object (format "O")
+    int32_t sz = 0;  // filled in by python_to_icu(); this is the value handed back to Python
+    UChar *icu = NULL;  // converted buffer; only needed so that sz gets computed
+    PyObject *src = NULL;  // the object parsed out of args
+  
+    if (!PyArg_ParseTuple(args, "O", &src)) return NULL;  // parse failure: propagate the exception set by PyArg_ParseTuple
+    icu = python_to_icu(src, &sz, 1);  // NOTE(review): sz is presumably the length in UChar (UTF-16) units; meaning of the trailing 1 is not visible here - confirm in python_to_icu
+    if (icu == NULL) return NULL;  // NOTE(review): assumes python_to_icu has already set a Python exception - confirm
+    free(icu);  // buffer contents are never read; only the length is returned
+    return Py_BuildValue("i", sz);  // "i" converts the C int to a Python integer
+} // }}}
+
 // Module initialization {{{
 static PyMethodDef icu_methods[] = {
    {"change_case", icu_change_case, METH_VARARGS,
@@ -1037,7 +1047,11 @@ static PyMethodDef icu_methods[] = {
    },

    {"string_length", icu_string_length, METH_VARARGS, 
-     "string_length(string) -> Return the length of a string (number of unicode code points in the string). Useful on anrrow python builds where len() returns an incorrect answer if the string contains surrogate pairs."
+     "string_length(string) -> Return the length of a string (number of unicode code points in the string). Useful on narrow python builds where len() returns an incorrect answer if the string contains surrogate pairs."
+    },
+
+    {"utf16_length", icu_utf16_length, METH_VARARGS, 
+     "utf16_length(string) -> Return the length of a string (number of UTF-16 code points in the string). Useful on wide python builds where len() returns an incorrect answer if the string contains surrogate pairs."
    },

    {NULL}  /* Sentinel */
@@ -281,6 +281,12 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
 # Return the number of unicode codepoints in a string
 string_length = _icu.string_length if is_narrow_build else len

+# Return the number of UTF-16 code units in a string
+try:
+    utf16_length = len if is_narrow_build else _icu.utf16_length  # narrow builds use the builtin len; otherwise defer to the C helper
+except AttributeError:
+    utf16_length = len  # People running from source: _icu has no utf16_length attribute (presumably an extension built before it was added), so fall back to len
+
 ################################################################################

 if __name__ == '__main__':
@@ -133,6 +133,8 @@ class TestICU(unittest.TestCase):
            self.ae(r, icu._icu.roundtrip(r))
        for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]:
            self.ae(icu._icu.string_length(x), l)
+        for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
+            self.ae(icu._icu.utf16_length(x), l)

    def test_character_name(self):
        ' Test character naming '