diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index 8169d29822..774079f9ee 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -968,18 +968,42 @@ icu_chr(PyObject *self, PyObject *args) {
     UChar32 code = 0;
     UChar buf[5] = {0};
     int32_t sz = 0;
-    char utf8[21];
-    PyObject *result = NULL;
 
     if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
 
     u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    u_strToUTF8(utf8, 20, &sz, buf, sz, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); goto end; }
-    result = PyUnicode_DecodeUTF8(utf8, sz, "strict");
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "arg not in range(0x110000)"); return NULL; }
+    return icu_to_python(buf, sz);
+} // }}}
+
+// ord_string {{{
+/*
+ * ord_string(string) -> tuple of code points
+ *
+ * Convert a python unicode string into a tuple holding one integer per
+ * unicode code point. The string is first converted to a temporary UChar32
+ * buffer by python_to_icu32(), which is freed again before returning.
+ * Returns NULL with a python exception set on failure.
+ */
+static PyObject *
+icu_ord_string(PyObject *self, PyObject *input) {
+    UChar32 *input_buf = NULL;
+    int32_t sz = 0, i = 0;
+    PyObject *ans = NULL, *temp = NULL;
+
+    input_buf = python_to_icu32(input, &sz, 1);
+    if (input_buf == NULL) goto end;
+    ans = PyTuple_New(sz);
+    if (ans == NULL) goto end;
+    for (i = 0; i < sz; i++) {
+        temp = PyInt_FromLong((long)input_buf[i]);
+        // PyInt_FromLong has already set the exception, only the partial tuple needs dropping
+        if (temp == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
+        PyTuple_SET_ITEM(ans, i, temp);  // steals the reference to temp
+    }
 end:
-    return result;
+    free(input_buf);  // free(NULL) is a no-op, so no guard is needed
+    return ans;
 } // }}}
 
 // normalize {{{
@@ -1130,6 +1154,10 @@ static PyMethodDef icu_methods[] = {
      "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
     },
 
+    {"ord_string", icu_ord_string, METH_O,
+     "ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
+    },
+
     {"normalize", icu_normalize, METH_VARARGS,
      "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
     },
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 0001d1521a..b645a9d395 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -231,6 +231,12 @@ primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator
 
 safe_chr = _icu.chr
 
+try:
+    ord_string = _icu.ord_string
+except AttributeError:
+    # People running from source
+    ord_string = lambda x: tuple(map(ord, x))
+
 def character_name(string):
     try:
         return _icu.character_name(unicode(string)) or None
diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h
index 9dc8cd2bfc..ddb11882bc 100644
--- a/src/calibre/utils/icu_calibre_utils.h
+++ b/src/calibre/utils/icu_calibre_utils.h
@@ -59,6 +59,53 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
 end:
     return ans;
 }
+
+#ifndef NO_PYTHON_TO_ICU32
+/*
+ * Convert a Python unicode object into a newly allocated, NUL-terminated
+ * array of UTF-32 code points (UChar32).
+ *
+ *   obj      - the Python object to convert
+ *   osz      - if not NULL, receives the number of code points written
+ *   do_check - if non-zero, fail with TypeError unless obj is exactly a
+ *              unicode object
+ *
+ * Returns NULL with a Python exception set on failure. On success the caller
+ * owns the returned buffer and must release it with free().
+ */
+static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
+    UChar32 *ans = NULL;
+    Py_ssize_t sz = 0;
+#ifndef Py_UNICODE_WIDE
+    UErrorCode status = U_ZERO_ERROR;
+#endif
+
+    if (do_check && !PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        goto end;
+    }
+
+    // Count of Py_UNICODE code units, not bytes: the buffer below is sized in
+    // UChar32 elements. A UTF-16 string never decodes to more code points than
+    // it has code units, so sz + 1 elements is enough on narrow builds too.
+    sz = PyUnicode_GET_SIZE(obj);
+    ans = (UChar32*) calloc((size_t)sz + 1, sizeof(UChar32)); // Ensure null termination
+    if (ans == NULL) { PyErr_NoMemory(); goto end; }
+
+#ifdef Py_UNICODE_WIDE
+// wide build (UCS 4)
+    memcpy(ans, PyUnicode_AS_UNICODE(obj), (size_t)sz * sizeof(UChar32));
+    if (osz != NULL) *osz = (int32_t)sz;
+#else
+// narrow build (UTF-16)
+    u_strToUTF32(ans, (int32_t)(sz + 1), osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
+#endif
+end:
+    return ans;
+}
+#endif
+
 #endif
 
 #ifndef NO_ICU_TO_PYTHON
diff --git 
a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 9f8edbb715..a2699c8cc2 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -136,6 +136,9 @@ class TestICU(unittest.TestCase): self.ae(icu._icu.string_length(x), l) for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]: self.ae(icu._icu.utf16_length(x), l) + self.ae(icu._icu.chr(0x1f431), '\U0001f431') + self.ae(icu._icu.ord_string('abc'), tuple(map(ord, 'abc'))) + self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,)) def test_character_name(self): ' Test character naming ' diff --git a/src/calibre/utils/matcher.c b/src/calibre/utils/matcher.c index c2c2210dad..e96ecfe4f7 100644 --- a/src/calibre/utils/matcher.c +++ b/src/calibre/utils/matcher.c @@ -6,6 +6,7 @@ */ #define NO_ICU_TO_PYTHON +#define NO_PYTHON_TO_ICU32 #include "icu_calibre_utils.h" #include