From 88e9494e6bb3ed296a640564f2cf89f2a63cef47 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 1 May 2018 09:48:44 +0530 Subject: [PATCH] Replace use of deprecated ICU unorm.h API --- src/calibre/utils/icu.c | 149 +++++++++++++++----------- src/calibre/utils/icu.py | 8 +- src/calibre/utils/icu_calibre_utils.h | 2 +- 3 files changed, 92 insertions(+), 67 deletions(-) diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 44eba37878..5e7ae07ce2 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -50,7 +50,7 @@ icu_Collator_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (!PyArg_ParseTuple(args, "s", &loc)) return NULL; collator = ucol_open(loc, &status); - if (collator == NULL || U_FAILURE(status)) { + if (collator == NULL || U_FAILURE(status)) { PyErr_SetString(PyExc_Exception, "Failed to create collator."); return NULL; } @@ -144,7 +144,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *input) { UChar *buf = NULL; uint8_t *buf2 = NULL; PyObject *ans = NULL; - + buf = python_to_icu(input, &sz, 1); if (buf == NULL) return NULL; @@ -173,7 +173,7 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) { int32_t asz = 0, bsz = 0; UChar *a = NULL, *b = NULL; UCollationResult res = UCOL_EQUAL; - + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz, 1); @@ -182,7 +182,7 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) { if (b == NULL) goto end; res = ucol_strcoll(self->collator, a, asz, b, bsz); end: - if (a != NULL) free(a); + if (a != NULL) free(a); if (b != NULL) free(b); return (PyErr_Occurred()) ? 
NULL : Py_BuildValue("i", res); @@ -191,7 +191,7 @@ end: // Collator.find {{{ static PyObject * icu_Collator_find(icu_Collator *self, PyObject *args) { -#if PY_VERSION_HEX >= 0x03030000 +#if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif PyObject *a_ = NULL, *b_ = NULL; @@ -199,7 +199,7 @@ icu_Collator_find(icu_Collator *self, PyObject *args) { int32_t asz = 0, bsz = 0, pos = -1, length = -1; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; - + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz, 1); @@ -238,7 +238,7 @@ icu_Collator_contains(icu_Collator *self, PyObject *args) { uint8_t found = 0; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; - + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz, 1); @@ -276,7 +276,7 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args) { if (self->contractions == NULL) return PyErr_NoMemory(); self->contractions = ucol_getTailoredSet(self->collator, &status); } - status = U_ZERO_ERROR; + status = U_ZERO_ERROR; count = uset_getItemCount(self->contractions); str = (UChar*)calloc(100, sizeof(UChar)); @@ -299,7 +299,7 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args) { } end: if (str != NULL) free(str); - + return ans; } // }}} @@ -310,7 +310,7 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) { int32_t asz = 0, bsz = 0; UChar *a = NULL, *b = NULL; uint8_t ans = 0; - + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz, 1); @@ -320,7 +320,7 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) { if (asz < bsz) goto end; if (bsz == 0) { ans = 1; goto end; } - + ans = ucol_equal(self->collator, a, bsz, b, bsz); end: @@ -340,7 +340,7 @@ icu_Collator_collation_order(icu_Collator *self, PyObject *a_) { UErrorCode status = U_ZERO_ERROR; UCollationElements *iter = NULL; int order = 0, len = -1; - + a = python_to_icu(a_, &asz, 1); if 
(a == NULL) goto end; @@ -420,17 +420,17 @@ static PyMethodDef icu_Collator_methods[] = { }; static PyGetSetDef icu_Collator_getsetters[] = { - {(char *)"actual_locale", + {(char *)"actual_locale", (getter)icu_Collator_actual_locale, NULL, (char *)"Actual locale used by this collator.", NULL}, - {(char *)"capsule", + {(char *)"capsule", (getter)icu_Collator_capsule, NULL, (char *)"A capsule enclosing the pointer to the ICU collator struct", NULL}, - {(char *)"display_name", + {(char *)"display_name", (getter)icu_Collator_display_name, NULL, (char *)"Display name of this collator in English. The name reflects the actual data source used.", NULL}, @@ -557,7 +557,7 @@ icu_BreakIterator_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (!PyArg_ParseTuple(args, "is", &break_iterator_type, &locale)) return NULL; break_iterator = ubrk_open(break_iterator_type, locale, NULL, 0, &status); - if (break_iterator == NULL || U_FAILURE(status)) { + if (break_iterator == NULL || U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; } @@ -577,7 +577,7 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) { int32_t sz = 0; UChar *buf = NULL; UErrorCode status = U_ZERO_ERROR; - + buf = python_to_icu(input, &sz, 1); if (buf == NULL) return NULL; ubrk_setText(self->break_iterator, buf, sz, &status); @@ -595,13 +595,13 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) { // BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { -#if PY_VERSION_HEX >= 0x03030000 +#if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL, *needle = NULL; int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0; - + buf = python_to_icu(token, &sz, 1); if (buf == NULL) return NULL; if (sz < 1) goto end; @@ -613,7 +613,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { p = 
ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); - if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) + if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) { @@ -655,7 +655,7 @@ end: // BreakIterator.split2 {{{ static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { -#if PY_VERSION_HEX >= 0x03030000 +#if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif @@ -663,14 +663,14 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0; UChar sep = 0; PyObject *ans = NULL, *temp = NULL, *t = NULL; - + ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); - if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) + if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? 
self->text_len - word_start : p - word_start; if (sz > 0) { @@ -703,12 +703,12 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { } else { sz += leading_hyphen + trailing_hyphen; last_sz = sz; - temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); + temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); if (temp == NULL) { - Py_DECREF(ans); ans = NULL; break; - } + Py_DECREF(ans); ans = NULL; break; + } if (PyList_Append(ans, temp) != 0) { - Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; + Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(temp); } @@ -912,18 +912,18 @@ icu_get_available_transliterators(PyObject *self, PyObject *args) { // character_name {{{ static PyObject * icu_character_name(PyObject *self, PyObject *args) { - char name[512] = {0}; + char name[512] = {0}; int32_t sz = 0, alias = 0; UChar *buf; UErrorCode status = U_ZERO_ERROR; PyObject *palias = NULL, *result = NULL, *input = NULL; UChar32 code = 0; - + if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL; - if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; + if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; buf = python_to_icu(input, &sz, 1); - if (buf == NULL) goto end; + if (buf == NULL) goto end; U16_GET(buf, 0, 0, sz, code); if (alias) { sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status); @@ -941,16 +941,16 @@ end: // character_name_from_code {{{ static PyObject * icu_character_name_from_code(PyObject *self, PyObject *args) { - char name[512] = {0}; + char name[512] = {0}; int32_t sz, alias = 0; UErrorCode status = U_ZERO_ERROR; PyObject *palias = NULL, *result = NULL; UChar32 code = 0; - + if (!PyArg_ParseTuple(args, "I|O", &code, &palias)) return NULL; - if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; - + if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; + if (alias) { sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status); } else { @@ -969,7 +969,7 
@@ icu_chr(PyObject *self, PyObject *args) { UChar32 code = 0; UChar buf[5] = {0}; int32_t sz = 0; - + if (!PyArg_ParseTuple(args, "I", &code)) return NULL; u_strFromUTF32(buf, 4, &sz, &code, 1, &status); @@ -996,26 +996,50 @@ icu_ord_string(PyObject *self, PyObject *input) { end: if (input_buf != NULL) free(input_buf); return ans; - + } // }}} // normalize {{{ +typedef enum { NFC, NFKC, NFD, NFKD } NORM_MODES; + static PyObject * icu_normalize(PyObject *self, PyObject *args) { UErrorCode status = U_ZERO_ERROR; - int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0; + int32_t sz = 0, cap = 0, rsz = 0; + NORM_MODES mode; UChar *dest = NULL, *source = NULL; PyObject *ret = NULL, *src = NULL; - + if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL; + const UNormalizer2 *n = NULL; + switch (mode) { + case NFC: + n = unorm2_getNFCInstance(&status); + break; + case NFKC: + n = unorm2_getNFKCInstance(&status); + break; + case NFD: + n = unorm2_getNFDInstance(&status); + break; + case NFKD: + n = unorm2_getNFKDInstance(&status); + break; + } + if (U_FAILURE(status)) { + PyErr_SetString(PyExc_ValueError, u_errorName(status)); + goto end; + } + source = python_to_icu(src, &sz, 1); - if (source == NULL) goto end; + if (source == NULL) goto end; cap = 2 * sz; dest = (UChar*) calloc(cap, sizeof(UChar)); if (dest == NULL) { PyErr_NoMemory(); goto end; } + while (1) { - rsz = unorm_normalize(source, sz, (UNormalizationMode)mode, 0, dest, cap, &status); + rsz = unorm2_normalize(n, source, sz, dest, cap, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { cap *= 2; dest = (UChar*) realloc(dest, cap*sizeof(UChar)); @@ -1029,7 +1053,7 @@ icu_normalize(PyObject *self, PyObject *args) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } - + ret = icu_to_python(dest, rsz); end: @@ -1044,7 +1068,7 @@ icu_roundtrip(PyObject *self, PyObject *src) { int32_t sz = 0; UChar *icu = NULL; PyObject *ret = NULL; - + icu = python_to_icu(src, &sz, 1); if (icu != NULL) { ret 
= icu_to_python(icu, sz); @@ -1071,7 +1095,7 @@ icu_break_iterator_locales(PyObject *self, PyObject *args) { PyTuple_SET_ITEM(ret, i, t); } } - + return ret; } // }}} @@ -1080,7 +1104,7 @@ static PyObject * icu_string_length(PyObject *self, PyObject *src) { int32_t sz = 0; UChar *icu = NULL; - + icu = python_to_icu(src, &sz, 1); if (icu == NULL) return NULL; sz = u_countChar32(icu, sz); @@ -1091,7 +1115,7 @@ icu_string_length(PyObject *self, PyObject *src) { // utf16_length {{{ static PyObject * icu_utf16_length(PyObject *self, PyObject *src) { -#if PY_VERSION_HEX >= 0x03030000 +#if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif @@ -1100,7 +1124,7 @@ icu_utf16_length(PyObject *self, PyObject *src) { int32_t i = 0, t = 0; Py_UNICODE *data = NULL; #endif - + if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "Must be a unicode object"); return NULL; } sz = (int32_t)PyUnicode_GET_SIZE(src); #ifdef Py_UNICODE_WIDE @@ -1135,39 +1159,39 @@ static PyMethodDef icu_methods[] = { "get_available_transliterators() -> Return list of available transliterators. This list is rather limited on OS X." }, - {"character_name", icu_character_name, METH_VARARGS, + {"character_name", icu_character_name, METH_VARARGS, "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string." }, - {"character_name_from_code", icu_character_name_from_code, METH_VARARGS, + {"character_name_from_code", icu_character_name_from_code, METH_VARARGS, "character_name_from_code(code, alias=False) -> Return the name for the specified unicode code point" }, - {"chr", icu_chr, METH_VARARGS, + {"chr", icu_chr, METH_VARARGS, "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)." 
}, - {"ord_string", icu_ord_string, METH_O, + {"ord_string", icu_ord_string, METH_O, "ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints." }, - {"normalize", icu_normalize, METH_VARARGS, + {"normalize", icu_normalize, METH_VARARGS, "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode." }, - {"roundtrip", icu_roundtrip, METH_O, + {"roundtrip", icu_roundtrip, METH_O, "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)" }, - {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_NOARGS, + {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_NOARGS, "available_locales_for_break_iterator() -> Return tuple of all available locales for the BreakIterator" }, - {"string_length", icu_string_length, METH_O, + {"string_length", icu_string_length, METH_O, "string_length(string) -> Return the length of a string (number of unicode code points in the string). Useful on narrow python builds where len() returns an incorrect answer if the string contains surrogate pairs." }, - {"utf16_length", icu_utf16_length, METH_O, + {"utf16_length", icu_utf16_length, METH_O, "utf16_length(string) -> Return the length of a string (number of UTF-16 code points in the string). Useful on wide python builds where len() returns an incorrect answer if the string contains surrogate pairs." 
}, @@ -1177,7 +1201,7 @@ static PyMethodDef icu_methods[] = { #define ADDUCONST(x) PyModule_AddIntConstant(m, #x, x) CALIBRE_MODINIT_FUNC -initicu(void) +initicu(void) { PyObject* m; UVersionInfo ver, uver; @@ -1232,13 +1256,10 @@ initicu(void) ADDUCONST(UCOL_LOWER_FIRST); ADDUCONST(UCOL_UPPER_FIRST); - ADDUCONST(UNORM_NONE); - ADDUCONST(UNORM_NFD); - ADDUCONST(UNORM_NFKD); - ADDUCONST(UNORM_NFC); - ADDUCONST(UNORM_DEFAULT); - ADDUCONST(UNORM_NFKC); - ADDUCONST(UNORM_FCD); + ADDUCONST(NFD); + ADDUCONST(NFKD); + ADDUCONST(NFC); + ADDUCONST(NFKC); ADDUCONST(UPPER_CASE); ADDUCONST(LOWER_CASE); diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 3efd1f4954..c91b5a6689 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -28,7 +28,7 @@ if _icu is None: raise RuntimeError('Failed to load icu with error: %s' % err) del err icu_unicode_version = getattr(_icu, 'unicode_version', None) -_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')} +_nmodes = {m:getattr(_icu, m) for m in ('NFC', 'NFD', 'NFKC', 'NFKD')} # Ensure that the python internal filesystem and default encodings are not ASCII @@ -38,6 +38,8 @@ def is_ascii(name): return codecs.lookup(name).name == b'ascii' except (TypeError, LookupError): return True + + try: if is_ascii(sys.getdefaultencoding()): _icu.set_default_encoding(b'utf-8') @@ -119,6 +121,7 @@ def case_sensitive_collator(): # function implementations based on different collators, to allow lazy loading # of collators, with maximum runtime performance + _sort_key_template = ''' def {name}(obj): try: @@ -222,6 +225,7 @@ def capitalize(x): except (IndexError, TypeError, AttributeError): return x + try: swapcase = _icu.swap_case except AttributeError: # For people running from source @@ -300,6 +304,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x): ans[last_c] = [item] return ans + # Return the number of unicode codepoints in a string 
string_length = _icu.string_length if is_narrow_build else len @@ -311,4 +316,3 @@ utf16_length = len if is_narrow_build else _icu.utf16_length if __name__ == '__main__': from calibre.utils.icu_test import run run(verbosity=4) - diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h index 6de9e9e9c7..25164283fc 100644 --- a/src/calibre/utils/icu_calibre_utils.h +++ b/src/calibre/utils/icu_calibre_utils.h @@ -19,7 +19,7 @@ #include #include #include -#include <unicode/unorm.h> +#include <unicode/unorm2.h> #include #if PY_VERSION_HEX >= 0x03030000