mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a function to efficiently convert a unicode string into a tuple of unicode code points
This commit is contained in:
parent
9fa83858a3
commit
95e36e66e7
@@ -968,18 +968,34 @@ icu_chr(PyObject *self, PyObject *args) {
     UChar32 code = 0;
     UChar buf[5] = {0};
     int32_t sz = 0;
-    char utf8[21];
-    PyObject *result = NULL;
 
     if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
 
     u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    result = PyUnicode_DecodeUTF8(utf8, sz, "strict");
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); return NULL; }
+    return icu_to_python(buf, sz);
+} // }}}
+
+// ord_string {{{
+static PyObject *
+icu_ord_string(PyObject *self, PyObject *input) {
+    UChar32 *input_buf = NULL;
+    int32_t sz = 0, i = 0;
+    PyObject *ans = NULL, *temp = NULL;
+
+    input_buf = python_to_icu32(input, &sz, 1);
+    if (input_buf == NULL) goto end;
+    ans = PyTuple_New(sz);
+    if (ans == NULL) goto end;
+    for (i = 0; i < sz; i++) {
+        temp = PyInt_FromLong((long)input_buf[i]);
+        if (temp == NULL) { Py_DECREF(ans); ans = NULL; PyErr_NoMemory(); goto end; }
+        PyTuple_SET_ITEM(ans, i, temp);
+    }
 end:
-    return result;
+    if (input_buf != NULL) free(input_buf);
+    return ans;
+
 } // }}}
 
 // normalize {{{
@@ -1130,6 +1146,10 @@ static PyMethodDef icu_methods[] = {
         "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
     },
 
+    {"ord_string", icu_ord_string, METH_O,
+        "ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
+    },
+
     {"normalize", icu_normalize, METH_VARARGS,
         "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
     },
@@ -231,6 +231,12 @@ primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator
 
 safe_chr = _icu.chr
 
+try:
+    ord_string = _icu.ord_string
+except AttributeError:
+    # People running from source
+    ord_string = lambda x: tuple(map(ord, x))
+
 def character_name(string):
     try:
         return _icu.character_name(unicode(string)) or None
@@ -59,6 +59,38 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
 end:
     return ans;
 }
+
+#ifndef NO_PYTHON_TO_ICU32
+static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
+    UChar32 *ans = NULL;
+    Py_ssize_t sz = 0;
+#ifndef Py_UNICODE_WIDE
+    UErrorCode status = U_ZERO_ERROR;
+#endif
+
+    if (do_check && !PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        goto end;
+    }
+
+    sz = PyUnicode_GET_DATA_SIZE(obj);
+    ans = (UChar32*) calloc(sz+1, 1); // Ensure null termination
+    if (ans == NULL) { PyErr_NoMemory(); goto end; }
+
+#ifdef Py_UNICODE_WIDE
+    // wide build (UCS 4)
+    memcpy(ans, PyUnicode_AS_UNICODE(obj), sz);
+    if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj);
+#else
+    // narrow build (UTF-16)
+    u_strToUTF32(ans, sz + 1, osz, (UChar*)PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
+#endif
+end:
+    return ans;
+}
+#endif
+
 #endif
 
 #ifndef NO_ICU_TO_PYTHON
@@ -136,6 +136,9 @@ class TestICU(unittest.TestCase):
             self.ae(icu._icu.string_length(x), l)
         for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
             self.ae(icu._icu.utf16_length(x), l)
+        self.ae(icu._icu.chr(0x1f431), '\U0001f431')
+        self.ae(icu._icu.ord_string('abc'), tuple(map(ord, 'abc')))
+        self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))
 
     def test_character_name(self):
         ' Test character naming '
@@ -6,6 +6,7 @@
 */
 
 #define NO_ICU_TO_PYTHON
+#define NO_PYTHON_TO_ICU32
 #include "icu_calibre_utils.h"
 #include <float.h>
 
Loading…
x
Reference in New Issue
Block a user