Add a function to efficiently convert a unicode string into a tuple of unicode code points

This commit is contained in:
Kovid Goyal 2015-01-21 18:35:22 +05:30
parent 9fa83858a3
commit 95e36e66e7
5 changed files with 69 additions and 7 deletions

View File

@ -968,18 +968,34 @@ icu_chr(PyObject *self, PyObject *args) {
UChar32 code = 0;
UChar buf[5] = {0};
int32_t sz = 0;
char utf8[21];
PyObject *result = NULL;
if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
result = PyUnicode_DecodeUTF8(utf8, sz, "strict");
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); return NULL; }
return icu_to_python(buf, sz);
} // }}}
// ord_string {{{
/*
 * ord_string(string) -> tuple of ints
 *
 * Convert a python unicode string into a tuple containing one integer per
 * unicode code point.
 *
 * input: the object passed from python (METH_O). It is handed to
 *        python_to_icu32() with do_check=1, so anything that is not exactly
 *        a unicode object results in NULL with a python exception set.
 *
 * Returns a new reference to the tuple, or NULL with a python exception set
 * on failure. The temporary UTF-32 buffer is always released before
 * returning.
 */
static PyObject *
icu_ord_string(PyObject *self, PyObject *input) {
    UChar32 *input_buf = NULL;
    int32_t sz = 0, i = 0;
    PyObject *ans = NULL, *temp = NULL;

    input_buf = python_to_icu32(input, &sz, 1);
    if (input_buf == NULL) goto end;

    ans = PyTuple_New(sz);
    if (ans == NULL) goto end;

    for (i = 0; i < sz; i++) {
        temp = PyInt_FromLong((long)input_buf[i]);
        if (temp == NULL) { Py_DECREF(ans); ans = NULL; PyErr_NoMemory(); goto end; }
        // PyTuple_SET_ITEM steals the reference to temp, so no DECREF here
        PyTuple_SET_ITEM(ans, i, temp);
    }

end:
    // Single exit point: free(NULL) is a no-op, so this is safe on every path
    free(input_buf);
    return ans;
} // }}}
// normalize {{{
@ -1130,6 +1146,10 @@ static PyMethodDef icu_methods[] = {
"chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
},
{"ord_string", icu_ord_string, METH_O,
"ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
},
{"normalize", icu_normalize, METH_VARARGS,
"normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
},

View File

@ -231,6 +231,12 @@ primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator
safe_chr = _icu.chr

# Use the compiled ord_string when the extension module provides it.
ord_string = getattr(_icu, 'ord_string', None)
if ord_string is None:
    # People running from source
    ord_string = lambda x: tuple(map(ord, x))
def character_name(string):
try:
return _icu.character_name(unicode(string)) or None

View File

@ -59,6 +59,38 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
end:
return ans;
}
#ifndef NO_PYTHON_TO_ICU32
/*
 * Convert a python unicode object into a newly allocated, zero terminated
 * array of UTF-32 code points.
 *
 * obj:      the python object to convert.
 * osz:      if not NULL, receives the number of code points written (not
 *           counting the terminator).
 * do_check: if non-zero, obj is verified to be exactly a unicode object and
 *           a TypeError is set otherwise.
 *
 * Returns a buffer obtained from calloc() that the caller must free(), or
 * NULL with a python exception set on failure.
 */
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
    UChar32 *ans = NULL;
    Py_ssize_t sz = 0;
#ifndef Py_UNICODE_WIDE
    UErrorCode status = U_ZERO_ERROR;
#endif

    if (do_check && !PyUnicode_CheckExact(obj)) {
        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
        goto end;
    }

    // Number of Py_UNICODE units: UTF-16 code units on narrow builds, code
    // points on wide builds. Either way the result can never contain more
    // than sz code points, so sz + 1 elements always leaves room for the
    // terminator.
    sz = PyUnicode_GET_SIZE(obj);
    // Size the buffer in UChar32 elements, not bytes, so that the capacity
    // reported to ICU below matches what was actually allocated.
    ans = (UChar32*) calloc((size_t)sz + 1, sizeof(UChar32));  // Ensure null termination
    if (ans == NULL) { PyErr_NoMemory(); goto end; }

#ifdef Py_UNICODE_WIDE
    // wide build (UCS 4)
    memcpy(ans, PyUnicode_AS_UNICODE(obj), (size_t)sz * sizeof(UChar32));
    if (osz != NULL) *osz = (int32_t)sz;
#else
    // narrow build (UTF-16)
    u_strToUTF32(ans, (int32_t)(sz + 1), osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
#endif

end:
    return ans;
}
#endif
#endif
#ifndef NO_ICU_TO_PYTHON

View File

@ -136,6 +136,9 @@ class TestICU(unittest.TestCase):
self.ae(icu._icu.string_length(x), l)
for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
self.ae(icu._icu.utf16_length(x), l)
self.ae(icu._icu.chr(0x1f431), '\U0001f431')
self.ae(icu._icu.ord_string('abc'), tuple(map(ord, 'abc')))
self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))
def test_character_name(self):
' Test character naming '

View File

@ -6,6 +6,7 @@
*/
#define NO_ICU_TO_PYTHON
#define NO_PYTHON_TO_ICU32
#include "icu_calibre_utils.h"
#include <float.h>