mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Add a function to efficiently convert a unicode string into a tuple of unicode code points
This commit is contained in:
parent
9fa83858a3
commit
95e36e66e7
@ -968,18 +968,34 @@ icu_chr(PyObject *self, PyObject *args) {
|
||||
UChar32 code = 0;
|
||||
UChar buf[5] = {0};
|
||||
int32_t sz = 0;
|
||||
char utf8[21];
|
||||
PyObject *result = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
|
||||
|
||||
u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
|
||||
u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
|
||||
result = PyUnicode_DecodeUTF8(utf8, sz, "strict");
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); return NULL; }
|
||||
return icu_to_python(buf, sz);
|
||||
} // }}}
|
||||
|
||||
// ord_string {{{
|
||||
/*
 * ord_string(unicode) -> tuple of ints
 *
 * Convert a Python unicode string into a tuple containing one Python int for
 * each UChar32 in the buffer produced by python_to_icu32().
 *
 * self:  unused.
 * input: the object to convert; it is passed to python_to_icu32() with
 *        do_check enabled.
 *
 * Returns a new reference to the tuple, or NULL with a Python exception set
 * when the conversion or an allocation fails.
 */
static PyObject *
icu_ord_string(PyObject *self, PyObject *input) {
    UChar32 *input_buf = NULL;
    int32_t sz = 0, i = 0;
    PyObject *ans = NULL, *temp = NULL;

    // On failure python_to_icu32 returns NULL; the exception is left as set there
    input_buf = python_to_icu32(input, &sz, 1);
    if (input_buf == NULL) goto end;
    ans = PyTuple_New(sz);
    if (ans == NULL) goto end;
    for (i = 0; i < sz; i++) {
        temp = PyInt_FromLong((long)input_buf[i]);
        if (temp == NULL) { Py_DECREF(ans); ans = NULL; PyErr_NoMemory(); goto end; }
        // PyTuple_SET_ITEM takes over the reference to temp, so no DECREF here
        PyTuple_SET_ITEM(ans, i, temp);
    }
end:
    // Single exit point: the temporary buffer is released on every path.
    // free(NULL) is a no-op, so no guard is needed.
    free(input_buf);
    return ans;

} // }}}
|
||||
|
||||
// normalize {{{
|
||||
@ -1130,6 +1146,10 @@ static PyMethodDef icu_methods[] = {
|
||||
"chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
|
||||
},
|
||||
|
||||
{"ord_string", icu_ord_string, METH_O,
|
||||
"ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
|
||||
},
|
||||
|
||||
{"normalize", icu_normalize, METH_VARARGS,
|
||||
"normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
|
||||
},
|
||||
|
@ -231,6 +231,12 @@ primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator
|
||||
|
||||
safe_chr = _icu.chr
|
||||
|
||||
# Use the compiled implementation when the extension exports it
ord_string = getattr(_icu, 'ord_string', None)
if ord_string is None:
    # People running from source
    def ord_string(x):
        return tuple(map(ord, x))
|
||||
|
||||
def character_name(string):
|
||||
try:
|
||||
return _icu.character_name(unicode(string)) or None
|
||||
|
@ -59,6 +59,38 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
|
||||
end:
|
||||
return ans;
|
||||
}
|
||||
|
||||
#ifndef NO_PYTHON_TO_ICU32
|
||||
/*
 * Convert a Python unicode object into a newly allocated, zero-terminated
 * buffer of UChar32 values.
 *
 * obj:      the Python object to convert.
 * osz:      if not NULL, receives the number of UChar32 values written
 *           (not counting the terminator).
 * do_check: when non-zero, obj is verified to be exactly a unicode object and
 *           a TypeError is raised otherwise.
 *
 * Returns the buffer, which the caller must release with free(), or NULL with
 * a Python exception set on failure.
 */
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
    UChar32 *ans = NULL;
    Py_ssize_t sz = 0;
#ifndef Py_UNICODE_WIDE
    UErrorCode status = U_ZERO_ERROR;
#endif

    if (do_check && !PyUnicode_CheckExact(obj)) {
        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
        goto end;
    }

    // Size the buffer in UChar32 elements, not bytes: sz Py_UNICODE units can
    // never yield more than sz code points, plus one element for the
    // terminating zero. calloc also guards the multiplication against overflow.
    sz = PyUnicode_GET_SIZE(obj);
    ans = (UChar32*) calloc((size_t)sz + 1, sizeof(UChar32));
    if (ans == NULL) { PyErr_NoMemory(); goto end; }

#ifdef Py_UNICODE_WIDE
    // wide build (UCS 4): the internal representation is copied as-is
    memcpy(ans, PyUnicode_AS_UNICODE(obj), (size_t)sz * sizeof(UChar32));
    if (osz != NULL) *osz = (int32_t)sz;
#else
    // narrow build (UTF-16): let ICU combine surrogate pairs into code points
    u_strToUTF32(ans, (int32_t)(sz + 1), osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
#endif
end:
    return ans;
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef NO_ICU_TO_PYTHON
|
||||
|
@ -136,6 +136,9 @@ class TestICU(unittest.TestCase):
|
||||
self.ae(icu._icu.string_length(x), l)
|
||||
for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
|
||||
self.ae(icu._icu.utf16_length(x), l)
|
||||
self.ae(icu._icu.chr(0x1f431), '\U0001f431')
|
||||
self.ae(icu._icu.ord_string('abc'), tuple(map(ord, 'abc')))
|
||||
self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))
|
||||
|
||||
def test_character_name(self):
|
||||
' Test character naming '
|
||||
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#define NO_ICU_TO_PYTHON
|
||||
#define NO_PYTHON_TO_ICU32
|
||||
#include "icu_calibre_utils.h"
|
||||
#include <float.h>
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user