From 88e9494e6bb3ed296a640564f2cf89f2a63cef47 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 1 May 2018 09:48:44 +0530
Subject: [PATCH] Replace use of deprecated ICU unorm.h API

---
 src/calibre/utils/icu.c               | 149 +++++++++++++++-----------
 src/calibre/utils/icu.py              |   8 +-
 src/calibre/utils/icu_calibre_utils.h |   2 +-
 3 files changed, 92 insertions(+), 67 deletions(-)

diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index 44eba37878..5e7ae07ce2 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -50,7 +50,7 @@ icu_Collator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 
     if (!PyArg_ParseTuple(args, "s", &loc)) return NULL;
     collator = ucol_open(loc, &status);
-    if (collator == NULL || U_FAILURE(status)) { 
+    if (collator == NULL || U_FAILURE(status)) {
         PyErr_SetString(PyExc_Exception, "Failed to create collator.");
         return NULL;
     }
@@ -144,7 +144,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *input) {
     UChar *buf = NULL;
     uint8_t *buf2 = NULL;
     PyObject *ans = NULL;
-  
+
     buf = python_to_icu(input, &sz, 1);
     if (buf == NULL) return NULL;
 
@@ -173,7 +173,7 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) {
     int32_t asz = 0, bsz = 0;
     UChar *a = NULL, *b = NULL;
     UCollationResult res = UCOL_EQUAL;
-  
+
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
     a = python_to_icu(a_, &asz, 1);
@@ -182,7 +182,7 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) {
     if (b == NULL) goto end;
     res = ucol_strcoll(self->collator, a, asz, b, bsz);
 end:
-    if (a != NULL) free(a); 
+    if (a != NULL) free(a);
     if (b != NULL) free(b);
 
     return (PyErr_Occurred()) ? NULL : Py_BuildValue("i", res);
@@ -191,7 +191,7 @@ end:
 // Collator.find {{{
 static PyObject *
 icu_Collator_find(icu_Collator *self, PyObject *args) {
-#if PY_VERSION_HEX >= 0x03030000 
+#if PY_VERSION_HEX >= 0x03030000
 #error Not implemented for python >= 3.3
 #endif
     PyObject *a_ = NULL, *b_ = NULL;
@@ -199,7 +199,7 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
     int32_t asz = 0, bsz = 0, pos = -1, length = -1;
     UErrorCode status = U_ZERO_ERROR;
     UStringSearch *search = NULL;
-  
+
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
     a = python_to_icu(a_, &asz, 1);
@@ -238,7 +238,7 @@ icu_Collator_contains(icu_Collator *self, PyObject *args) {
     uint8_t found = 0;
     UErrorCode status = U_ZERO_ERROR;
     UStringSearch *search = NULL;
-  
+
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
     a = python_to_icu(a_, &asz, 1);
@@ -276,7 +276,7 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args) {
         if (self->contractions == NULL) return PyErr_NoMemory();
         self->contractions = ucol_getTailoredSet(self->collator, &status);
     }
-    status = U_ZERO_ERROR; 
+    status = U_ZERO_ERROR;
     count = uset_getItemCount(self->contractions);
 
     str = (UChar*)calloc(100, sizeof(UChar));
@@ -299,7 +299,7 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args) {
     }
 end:
     if (str != NULL) free(str);
-  
+
     return ans;
 } // }}}
 
@@ -310,7 +310,7 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) {
     int32_t asz = 0, bsz = 0;
     UChar *a = NULL, *b = NULL;
     uint8_t ans = 0;
-  
+
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
     a = python_to_icu(a_, &asz, 1);
@@ -320,7 +320,7 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) {
 
     if (asz < bsz) goto end;
     if (bsz == 0) { ans = 1; goto end; }
-    
+
     ans = ucol_equal(self->collator, a, bsz, b, bsz);
 
 end:
@@ -340,7 +340,7 @@ icu_Collator_collation_order(icu_Collator *self, PyObject *a_) {
     UErrorCode status = U_ZERO_ERROR;
     UCollationElements *iter = NULL;
     int order = 0, len = -1;
-  
+
     a = python_to_icu(a_, &asz, 1);
     if (a == NULL) goto end;
 
@@ -420,17 +420,17 @@ static PyMethodDef icu_Collator_methods[] = {
 };
 
 static PyGetSetDef  icu_Collator_getsetters[] = {
-    {(char *)"actual_locale", 
+    {(char *)"actual_locale",
      (getter)icu_Collator_actual_locale, NULL,
      (char *)"Actual locale used by this collator.",
      NULL},
 
-    {(char *)"capsule", 
+    {(char *)"capsule",
      (getter)icu_Collator_capsule, NULL,
      (char *)"A capsule enclosing the pointer to the ICU collator struct",
      NULL},
 
-    {(char *)"display_name", 
+    {(char *)"display_name",
      (getter)icu_Collator_display_name, NULL,
      (char *)"Display name of this collator in English. The name reflects the actual data source used.",
      NULL},
@@ -557,7 +557,7 @@ icu_BreakIterator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 
     if (!PyArg_ParseTuple(args, "is", &break_iterator_type, &locale)) return NULL;
     break_iterator = ubrk_open(break_iterator_type, locale, NULL, 0, &status);
-    if (break_iterator == NULL || U_FAILURE(status)) { 
+    if (break_iterator == NULL || U_FAILURE(status)) {
         PyErr_SetString(PyExc_ValueError, u_errorName(status));
         return NULL;
     }
@@ -577,7 +577,7 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
     int32_t sz = 0;
     UChar *buf = NULL;
     UErrorCode status = U_ZERO_ERROR;
-  
+
     buf = python_to_icu(input, &sz, 1);
     if (buf == NULL) return NULL;
     ubrk_setText(self->break_iterator, buf, sz, &status);
@@ -595,13 +595,13 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
 // BreakIterator.index {{{
 static PyObject *
 icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
-#if PY_VERSION_HEX >= 0x03030000 
+#if PY_VERSION_HEX >= 0x03030000
 #error Not implemented for python >= 3.3
 #endif
 
     UChar *buf = NULL, *needle = NULL;
     int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
-  
+
     buf = python_to_icu(token, &sz, 1);
     if (buf == NULL) return NULL;
     if (sz < 1) goto end;
@@ -613,7 +613,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
     p = ubrk_first(self->break_iterator);
     while (p != UBRK_DONE) {
         word_start = p; p = ubrk_next(self->break_iterator);
-        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
+        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
             continue;  // We are not at the start of a word
 
         if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
@@ -655,7 +655,7 @@ end:
 // BreakIterator.split2 {{{
 static PyObject *
 icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
-#if PY_VERSION_HEX >= 0x03030000 
+#if PY_VERSION_HEX >= 0x03030000
 #error Not implemented for python >= 3.3
 #endif
 
@@ -663,14 +663,14 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
     int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
     UChar sep = 0;
     PyObject *ans = NULL, *temp = NULL, *t = NULL;
-  
+
     ans = PyList_New(0);
     if (ans == NULL) return PyErr_NoMemory();
 
     p = ubrk_first(self->break_iterator);
     while (p != UBRK_DONE) {
         word_start = p; p = ubrk_next(self->break_iterator);
-        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
+        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
             continue;  // We are not at the start of a word
         sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
         if (sz > 0) {
@@ -703,12 +703,12 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
             } else {
                 sz += leading_hyphen + trailing_hyphen;
                 last_sz = sz;
-                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); 
+                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                 if (temp == NULL) {
-                    Py_DECREF(ans); ans = NULL; break; 
-                } 
+                    Py_DECREF(ans); ans = NULL; break;
+                }
                 if (PyList_Append(ans, temp) != 0) {
-                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; 
+                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
                 }
                 Py_DECREF(temp);
             }
@@ -912,18 +912,18 @@ icu_get_available_transliterators(PyObject *self, PyObject *args) {
 // character_name {{{
 static PyObject *
 icu_character_name(PyObject *self, PyObject *args) {
-    char name[512] = {0}; 
+    char name[512] = {0};
     int32_t sz = 0, alias = 0;
     UChar *buf;
     UErrorCode status = U_ZERO_ERROR;
     PyObject *palias = NULL, *result = NULL, *input = NULL;
     UChar32 code = 0;
-  
+
     if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL;
 
-    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; 
+    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
     buf = python_to_icu(input, &sz, 1);
-    if (buf == NULL) goto end; 
+    if (buf == NULL) goto end;
     U16_GET(buf, 0, 0, sz, code);
     if (alias) {
         sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
@@ -941,16 +941,16 @@ end:
 // character_name_from_code {{{
 static PyObject *
 icu_character_name_from_code(PyObject *self, PyObject *args) {
-    char name[512] = {0}; 
+    char name[512] = {0};
     int32_t sz, alias = 0;
     UErrorCode status = U_ZERO_ERROR;
     PyObject *palias = NULL, *result = NULL;
     UChar32 code = 0;
-  
+
     if (!PyArg_ParseTuple(args, "I|O", &code, &palias)) return NULL;
 
-    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; 
-    
+    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
+
     if (alias) {
         sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
     } else {
@@ -969,7 +969,7 @@ icu_chr(PyObject *self, PyObject *args) {
     UChar32 code = 0;
     UChar buf[5] = {0};
     int32_t sz = 0;
-  
+
     if (!PyArg_ParseTuple(args, "I", &code)) return NULL;
 
     u_strFromUTF32(buf, 4, &sz, &code, 1, &status);
@@ -996,26 +996,50 @@ icu_ord_string(PyObject *self, PyObject *input) {
 end:
     if (input_buf != NULL) free(input_buf);
     return ans;
-  
+
 } // }}}
 
 // normalize {{{
+typedef enum { NFC, NFKC, NFD, NFKD } NORM_MODES;
+
 static PyObject *
 icu_normalize(PyObject *self, PyObject *args) {
     UErrorCode status = U_ZERO_ERROR;
-    int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0;
+    int32_t sz = 0, cap = 0, rsz = 0;
+    NORM_MODES mode;
     UChar *dest = NULL, *source = NULL;
     PyObject *ret = NULL, *src = NULL;
-  
+
     if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL;
+    const UNormalizer2 *n = NULL;
+    switch (mode) {
+        case NFC:
+            n = unorm2_getNFCInstance(&status);
+            break;
+        case NFKC:
+            n = unorm2_getNFKCInstance(&status);
+            break;
+        case NFD:
+            n = unorm2_getNFDInstance(&status);
+            break;
+        case NFKD:
+            n = unorm2_getNFKDInstance(&status);
+            break;
+    }
+    if (U_FAILURE(status)) {
+        PyErr_SetString(PyExc_ValueError, u_errorName(status));
+        goto end;
+    }
+    if (n == NULL) { PyErr_SetString(PyExc_ValueError, "Unknown normalization mode"); goto end; }  /* mode matched no case above */
     source = python_to_icu(src, &sz, 1);
-    if (source == NULL) goto end; 
+    if (source == NULL) goto end;
     cap = 2 * sz;
     dest = (UChar*) calloc(cap, sizeof(UChar));
     if (dest == NULL) { PyErr_NoMemory(); goto end; }
 
+
     while (1) {
-        rsz = unorm_normalize(source, sz, (UNormalizationMode)mode, 0, dest, cap, &status);
+        rsz = unorm2_normalize(n, source, sz, dest, cap, &status);
         if (status == U_BUFFER_OVERFLOW_ERROR) {
             cap *= 2;
             dest = (UChar*) realloc(dest, cap*sizeof(UChar));
@@ -1029,7 +1053,7 @@ icu_normalize(PyObject *self, PyObject *args) {
         PyErr_SetString(PyExc_ValueError, u_errorName(status));
         goto end;
     }
- 
+
     ret = icu_to_python(dest, rsz);
 
 end:
@@ -1044,7 +1068,7 @@ icu_roundtrip(PyObject *self, PyObject *src) {
     int32_t sz = 0;
     UChar *icu = NULL;
     PyObject *ret = NULL;
-  
+
     icu = python_to_icu(src, &sz, 1);
     if (icu != NULL) {
         ret = icu_to_python(icu, sz);
@@ -1071,7 +1095,7 @@ icu_break_iterator_locales(PyObject *self, PyObject *args) {
             PyTuple_SET_ITEM(ret, i, t);
         }
     }
-  
+
     return ret;
 } // }}}
 
@@ -1080,7 +1104,7 @@ static PyObject *
 icu_string_length(PyObject *self, PyObject *src) {
     int32_t sz = 0;
     UChar *icu = NULL;
-  
+
     icu = python_to_icu(src, &sz, 1);
     if (icu == NULL) return NULL;
     sz = u_countChar32(icu, sz);
@@ -1091,7 +1115,7 @@ icu_string_length(PyObject *self, PyObject *src) {
 // utf16_length {{{
 static PyObject *
 icu_utf16_length(PyObject *self, PyObject *src) {
-#if PY_VERSION_HEX >= 0x03030000 
+#if PY_VERSION_HEX >= 0x03030000
 #error Not implemented for python >= 3.3
 #endif
 
@@ -1100,7 +1124,7 @@ icu_utf16_length(PyObject *self, PyObject *src) {
     int32_t i = 0, t = 0;
     Py_UNICODE *data = NULL;
 #endif
-  
+
     if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "Must be a unicode object"); return NULL; }
     sz = (int32_t)PyUnicode_GET_SIZE(src);
 #ifdef Py_UNICODE_WIDE
@@ -1135,39 +1159,39 @@ static PyMethodDef icu_methods[] = {
         "get_available_transliterators() -> Return list of available transliterators. This list is rather limited on OS X."
     },
 
-    {"character_name", icu_character_name, METH_VARARGS, 
+    {"character_name", icu_character_name, METH_VARARGS,
      "character_name(char, alias=False) -> Return name for the first character in char, which must be a unicode string."
     },
 
-    {"character_name_from_code", icu_character_name_from_code, METH_VARARGS, 
+    {"character_name_from_code", icu_character_name_from_code, METH_VARARGS,
      "character_name_from_code(code, alias=False) -> Return the name for the specified unicode code point"
     },
 
-    {"chr", icu_chr, METH_VARARGS, 
+    {"chr", icu_chr, METH_VARARGS,
      "chr(code) -> Return a python unicode string corresponding to the specified character code. The string can have length 1 or 2 (for non BMP codes on narrow python builds)."
     },
 
-    {"ord_string", icu_ord_string, METH_O, 
+    {"ord_string", icu_ord_string, METH_O,
      "ord_string(code) -> Convert a python unicode string to a tuple of unicode codepoints."
     },
 
-    {"normalize", icu_normalize, METH_VARARGS, 
+    {"normalize", icu_normalize, METH_VARARGS,
      "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
     },
 
-    {"roundtrip", icu_roundtrip, METH_O, 
+    {"roundtrip", icu_roundtrip, METH_O,
      "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)"
     },
 
-    {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_NOARGS, 
+    {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_NOARGS,
      "available_locales_for_break_iterator() -> Return tuple of all available locales for the BreakIterator"
     },
 
-    {"string_length", icu_string_length, METH_O, 
+    {"string_length", icu_string_length, METH_O,
      "string_length(string) -> Return the length of a string (number of unicode code points in the string). Useful on narrow python builds where len() returns an incorrect answer if the string contains surrogate pairs."
     },
 
-    {"utf16_length", icu_utf16_length, METH_O, 
+    {"utf16_length", icu_utf16_length, METH_O,
      "utf16_length(string) -> Return the length of a string (number of UTF-16 code points in the string). Useful on wide python builds where len() returns an incorrect answer if the string contains surrogate pairs."
     },
 
@@ -1177,7 +1201,7 @@ static PyMethodDef icu_methods[] = {
 #define ADDUCONST(x) PyModule_AddIntConstant(m, #x, x)
 
 CALIBRE_MODINIT_FUNC
-initicu(void) 
+initicu(void)
 {
     PyObject* m;
     UVersionInfo ver, uver;
@@ -1232,13 +1256,10 @@ initicu(void)
     ADDUCONST(UCOL_LOWER_FIRST);
     ADDUCONST(UCOL_UPPER_FIRST);
 
-    ADDUCONST(UNORM_NONE);
-    ADDUCONST(UNORM_NFD);
-    ADDUCONST(UNORM_NFKD);
-    ADDUCONST(UNORM_NFC);
-    ADDUCONST(UNORM_DEFAULT);
-    ADDUCONST(UNORM_NFKC);
-    ADDUCONST(UNORM_FCD);
+    ADDUCONST(NFD);
+    ADDUCONST(NFKD);
+    ADDUCONST(NFC);
+    ADDUCONST(NFKC);
 
     ADDUCONST(UPPER_CASE);
     ADDUCONST(LOWER_CASE);
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 3efd1f4954..c91b5a6689 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -28,7 +28,7 @@ if _icu is None:
     raise RuntimeError('Failed to load icu with error: %s' % err)
 del err
 icu_unicode_version = getattr(_icu, 'unicode_version', None)
-_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
+_nmodes = {m:getattr(_icu, m) for m in ('NFC', 'NFD', 'NFKC', 'NFKD')}
 
 # Ensure that the python internal filesystem and default encodings are not ASCII
 
@@ -38,6 +38,8 @@ def is_ascii(name):
         return codecs.lookup(name).name == b'ascii'
     except (TypeError, LookupError):
         return True
+
+
 try:
     if is_ascii(sys.getdefaultencoding()):
         _icu.set_default_encoding(b'utf-8')
@@ -119,6 +121,7 @@ def case_sensitive_collator():
 # function implementations based on different collators, to allow lazy loading
 # of collators, with maximum runtime performance
 
+
 _sort_key_template = '''
 def {name}(obj):
     try:
@@ -222,6 +225,7 @@ def capitalize(x):
     except (IndexError, TypeError, AttributeError):
         return x
 
+
 try:
     swapcase = _icu.swap_case
 except AttributeError:  # For people running from source
@@ -300,6 +304,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
             ans[last_c] = [item]
     return ans
 
+
 # Return the number of unicode codepoints in a string
 string_length = _icu.string_length if is_narrow_build else len
 
@@ -311,4 +316,3 @@ utf16_length = len if is_narrow_build else _icu.utf16_length
 if __name__ == '__main__':
     from calibre.utils.icu_test import run
     run(verbosity=4)
-
diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h
index 6de9e9e9c7..25164283fc 100644
--- a/src/calibre/utils/icu_calibre_utils.h
+++ b/src/calibre/utils/icu_calibre_utils.h
@@ -19,7 +19,7 @@
 #include <unicode/ustring.h>
 #include <unicode/usearch.h>
 #include <unicode/utrans.h>
-#include <unicode/unorm.h>
+#include <unicode/unorm2.h>
 #include <unicode/ubrk.h>
 
 #if PY_VERSION_HEX >= 0x03030000