From f078cd71683ee5589519160700969b6eff8068d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 7 Mar 2014 21:46:01 +0530 Subject: [PATCH] Performance improvements and code cleanup for the ICU module --- src/calibre/gui2/complete2.py | 4 +- src/calibre/test_build.py | 7 +- src/calibre/utils/icu.c | 494 ++++++++++--------------- src/calibre/utils/icu.py | 666 ++++++++++------------------------ src/calibre/utils/icu_test.py | 148 ++++++++ 5 files changed, 523 insertions(+), 796 deletions(-) create mode 100644 src/calibre/utils/icu_test.py diff --git a/src/calibre/gui2/complete2.py b/src/calibre/gui2/complete2.py index 8aa28069f8..623215a6e6 100644 --- a/src/calibre/gui2/complete2.py +++ b/src/calibre/gui2/complete2.py @@ -14,13 +14,13 @@ from PyQt4.Qt import (QLineEdit, QAbstractListModel, Qt, pyqtSignal, QObject, QApplication, QListView, QPoint, QModelIndex, QFont, QFontInfo) from calibre.constants import isosx, get_osx_version -from calibre.utils.icu import sort_key, primary_startswith, primary_icu_find +from calibre.utils.icu import sort_key, primary_startswith, primary_find from calibre.gui2 import NONE from calibre.gui2.widgets import EnComboBox, LineEditECM from calibre.utils.config import tweaks def containsq(x, prefix): - return primary_icu_find(prefix, x)[0] != -1 + return primary_find(prefix, x)[0] != -1 class CompleteModel(QAbstractListModel): # {{{ diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py index 618626883e..fd9a36df61 100644 --- a/src/calibre/test_build.py +++ b/src/calibre/test_build.py @@ -113,10 +113,9 @@ def test_ssl(): print ('SSL OK!') def test_icu(): - from calibre.utils.icu import _icu_not_ok, test_roundtrip - if _icu_not_ok: - raise RuntimeError('ICU module not loaded/valid') - test_roundtrip() + print ('Testing ICU') + from calibre.utils.icu_test import test_build + test_build() print ('ICU OK!') def test_wpd(): diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 93d66a20a2..d556115c45 100644 --- 
a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -1,5 +1,9 @@ #include "icu_calibre_utils.h" +#define UPPER_CASE 0 +#define LOWER_CASE 1 +#define TITLE_CASE 2 + static PyObject* uchar_to_unicode(const UChar *src, int32_t len) { wchar_t *buf = NULL; PyObject *ans = NULL; @@ -66,20 +70,16 @@ icu_Collator_display_name(icu_Collator *self, void *closure) { const char *loc = NULL; UErrorCode status = U_ZERO_ERROR; UChar dname[400]; - char buf[100]; + int32_t sz = 0; loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status); - if (loc == NULL || U_FAILURE(status)) { + if (loc == NULL) { PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL; } - ucol_getDisplayName(loc, "en", dname, 100, &status); - if (U_FAILURE(status)) return PyErr_NoMemory(); + sz = ucol_getDisplayName(loc, "en", dname, sizeof(dname), &status); + if (U_FAILURE(status)) {PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; } - u_strToUTF8(buf, 100, NULL, dname, -1, &status); - if (U_FAILURE(status)) { - PyErr_SetString(PyExc_Exception, "Failed to convert dname to UTF-8"); return NULL; - } - return Py_BuildValue("s", buf); + return icu_to_python(dname, sz); } // }}} @@ -140,47 +140,29 @@ icu_Collator_capsule(icu_Collator *self, void *closure) { // Collator.sort_key {{{ static PyObject * icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) { - char *input; - int32_t sz; - UChar *buf; - uint8_t *buf2; - PyObject *ans; - int32_t key_size; - UErrorCode status = U_ZERO_ERROR; + int32_t sz = 0, key_size = 0, bsz = 0; + UChar *buf = NULL; + uint8_t *buf2 = NULL; + PyObject *ans = NULL, *input = NULL; - if (!PyArg_ParseTuple(args, "es", "UTF-8", &input)) return NULL; + if (!PyArg_ParseTuple(args, "O", &input)) return NULL; + buf = python_to_icu(input, &sz, 1); + if (buf == NULL) return NULL; - sz = (int32_t)strlen(input); + bsz = 7 * sz + 1; + buf2 = (uint8_t*)calloc(bsz, sizeof(uint8_t)); + if (buf2 == NULL) { 
PyErr_NoMemory(); goto end; } + key_size = ucol_getSortKey(self->collator, buf, sz, buf2, bsz); + if (key_size > bsz) { + buf2 = realloc(buf2, (key_size + 1) * sizeof(uint8_t)); + if (buf2 == NULL) { PyErr_NoMemory(); goto end; } + key_size = ucol_getSortKey(self->collator, buf, sz, buf2, key_size + 1); + } + ans = PyBytes_FromStringAndSize((char*)buf2, key_size); - buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar)); - - if (buf == NULL) return PyErr_NoMemory(); - - u_strFromUTF8(buf, sz*4 + 1, &key_size, input, sz, &status); - PyMem_Free(input); - - if (U_SUCCESS(status)) { - buf2 = (uint8_t*)calloc(7*sz+1, sizeof(uint8_t)); - if (buf2 == NULL) return PyErr_NoMemory(); - - key_size = ucol_getSortKey(self->collator, buf, -1, buf2, 7*sz+1); - - if (key_size == 0) { - ans = PyBytes_FromString(""); - } else { - if (key_size >= 7*sz+1) { - free(buf2); - buf2 = (uint8_t*)calloc(key_size+1, sizeof(uint8_t)); - if (buf2 == NULL) return PyErr_NoMemory(); - ucol_getSortKey(self->collator, buf, -1, buf2, key_size+1); - } - ans = PyBytes_FromString((char *)buf2); - } - free(buf2); - } else ans = PyBytes_FromString(""); - - free(buf); - if (ans == NULL) return PyErr_NoMemory(); +end: + if (buf != NULL) free(buf); + if (buf2 != NULL) free(buf2); return ans; } // }}} @@ -188,86 +170,64 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) { // Collator.strcmp {{{ static PyObject * icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) { - char *a_, *b_; - int32_t asz, bsz; - UChar *a, *b; - UErrorCode status = U_ZERO_ERROR; + PyObject *a_ = NULL, *b_ = NULL; + int32_t asz = 0, bsz = 0; + UChar *a = NULL, *b = NULL; UCollationResult res = UCOL_EQUAL; - if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL; - - asz = (int32_t)strlen(a_); bsz = (int32_t)strlen(b_); + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; - a = (UChar*)calloc(asz*4 + 1, sizeof(UChar)); - b = (UChar*)calloc(bsz*4 + 1, sizeof(UChar)); + 
a = python_to_icu(a_, &asz, 1); + if (a == NULL) goto end; + b = python_to_icu(b_, &bsz, 1); + if (b == NULL) goto end; + res = ucol_strcoll(self->collator, a, asz, b, bsz); +end: + if (a != NULL) free(a); if (b != NULL) free(b); - - if (a == NULL || b == NULL) return PyErr_NoMemory(); - - u_strFromUTF8(a, asz*4 + 1, NULL, a_, asz, &status); - u_strFromUTF8(b, bsz*4 + 1, NULL, b_, bsz, &status); - PyMem_Free(a_); PyMem_Free(b_); - - if (U_SUCCESS(status)) - res = ucol_strcoll(self->collator, a, -1, b, -1); - - free(a); free(b); - - return Py_BuildValue("i", res); + return (PyErr_Occurred()) ? NULL : Py_BuildValue("i", res); } // }}} // Collator.find {{{ static PyObject * icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) { - PyObject *a_, *b_; - int32_t asz, bsz; - UChar *a, *b; - wchar_t *aw, *bw; + PyObject *a_ = NULL, *b_ = NULL; + UChar *a = NULL, *b = NULL; + int32_t asz = 0, bsz = 0, pos = -1, length = -1; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; - int32_t pos = -1, length = -1; - if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL; - asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_); - - a = (UChar*)calloc(asz*4 + 2, sizeof(UChar)); - b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar)); - aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t)); - bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t)); + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; - if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory(); - - PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1); - PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1); - u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status); - u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status); + a = python_to_icu(a_, &asz, 1); + if (a == NULL) goto end; + b = python_to_icu(b_, &bsz, 1); + if (b == NULL) goto end; + search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status); if (U_SUCCESS(status)) { - search = 
usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status); - if (U_SUCCESS(status)) { - pos = usearch_first(search, &status); - if (pos != USEARCH_DONE) - length = usearch_getMatchedLength(search); - else - pos = -1; - } - if (search != NULL) usearch_close(search); + pos = usearch_first(search, &status); + if (pos != USEARCH_DONE) + length = usearch_getMatchedLength(search); + else + pos = -1; } +end: + if (search != NULL) usearch_close(search); + if (a != NULL) free(a); + if (b != NULL) free(b); - free(a); free(b); free(aw); free(bw); - - return Py_BuildValue("ii", pos, length); + return (PyErr_Occurred()) ? NULL : Py_BuildValue("ii", pos, length); } // }}} // Collator.contractions {{{ static PyObject * icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) { UErrorCode status = U_ZERO_ERROR; - UChar *str; + UChar *str = NULL; UChar32 start=0, end=0; - int32_t count = 0, len = 0, dlen = 0, i; + int32_t count = 0, len = 0, i; PyObject *ans = Py_None, *pbuf; - wchar_t *buf; if (self->contractions == NULL) { self->contractions = uset_open(1, 0); @@ -275,107 +235,112 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) self->contractions = ucol_getTailoredSet(self->collator, &status); } status = U_ZERO_ERROR; + count = uset_getItemCount(self->contractions); str = (UChar*)calloc(100, sizeof(UChar)); - buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t)); - if (str == NULL || buf == NULL) return PyErr_NoMemory(); - - count = uset_getItemCount(self->contractions); + if (str == NULL) { PyErr_NoMemory(); goto end; } ans = PyTuple_New(count); - if (ans != NULL) { - for (i = 0; i < count; i++) { - len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status); - if (len >= 2) { - // We have a string - status = U_ZERO_ERROR; - u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status); - pbuf = PyUnicode_FromWideChar(buf, dlen); - if (pbuf == NULL) return PyErr_NoMemory(); - PyTuple_SetItem(ans, i, pbuf); - 
} else { - // Ranges dont make sense for contractions, ignore them - PyTuple_SetItem(ans, i, Py_None); - } + if (ans == NULL) { goto end; } + + for (i = 0; i < count; i++) { + len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status); + if (len >= 2) { + // We have a string + status = U_ZERO_ERROR; + pbuf = icu_to_python(str, len); + if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; } + PyTuple_SetItem(ans, i, pbuf); + } else { + // Ranges dont make sense for contractions, ignore them + PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None); } } - free(str); free(buf); +end: + if (str != NULL) free(str); - return Py_BuildValue("O", ans); + return ans; } // }}} // Collator.startswith {{{ static PyObject * icu_Collator_startswith(icu_Collator *self, PyObject *args, PyObject *kwargs) { - PyObject *a_, *b_; - int32_t asz, bsz; - int32_t actual_a, actual_b; - UChar *a, *b; - wchar_t *aw, *bw; - UErrorCode status = U_ZERO_ERROR; - int ans = 0; + PyObject *a_ = NULL, *b_ = NULL; + int32_t asz = 0, bsz = 0; + UChar *a = NULL, *b = NULL; + uint8_t ans = 0; - if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL; - asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_); - if (asz < bsz) Py_RETURN_FALSE; - if (bsz == 0) Py_RETURN_TRUE; + if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; + + a = python_to_icu(a_, &asz, 1); + if (a == NULL) goto end; + b = python_to_icu(b_, &bsz, 1); + if (b == NULL) goto end; + + if (asz < bsz) goto end; + if (bsz == 0) { ans = 1; goto end; } - a = (UChar*)calloc(asz*4 + 2, sizeof(UChar)); - b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar)); - aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t)); - bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t)); + ans = ucol_equal(self->collator, a, bsz, b, bsz); - if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory(); +end: + if (a != NULL) free(a); + if (b != NULL) free(b); - actual_a = 
(int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1); - actual_b = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1); - if (actual_a > -1 && actual_b > -1) { - u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status); - u_strFromWCS(b, bsz*4 + 1, &actual_b, bw, -1, &status); - - if (U_SUCCESS(status) && ucol_equal(self->collator, a, actual_b, b, actual_b)) - ans = 1; - } - - free(a); free(b); free(aw); free(bw); - if (ans) Py_RETURN_TRUE; + if (PyErr_Occurred()) return NULL; + if (ans) { Py_RETURN_TRUE; } Py_RETURN_FALSE; } // }}} -// Collator.startswith {{{ +// Collator.collation_order {{{ static PyObject * icu_Collator_collation_order(icu_Collator *self, PyObject *args, PyObject *kwargs) { - PyObject *a_; - int32_t asz; - int32_t actual_a; - UChar *a; - wchar_t *aw; + PyObject *a_ = NULL; + int32_t asz = 0; + UChar *a = NULL; UErrorCode status = U_ZERO_ERROR; UCollationElements *iter = NULL; int order = 0, len = -1; - if (!PyArg_ParseTuple(args, "U", &a_)) return NULL; - asz = (int32_t)PyUnicode_GetSize(a_); - - a = (UChar*)calloc(asz*4 + 2, sizeof(UChar)); - aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t)); + if (!PyArg_ParseTuple(args, "O", &a_)) return NULL; - if (a == NULL || aw == NULL ) return PyErr_NoMemory(); + a = python_to_icu(a_, &asz, 1); + if (a == NULL) goto end; - actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1); - if (actual_a > -1) { - u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status); - iter = ucol_openElements(self->collator, a, actual_a, &status); - if (iter != NULL && U_SUCCESS(status)) { - order = ucol_next(iter, &status); - len = ucol_getOffset(iter); - ucol_closeElements(iter); iter = NULL; - } - } - - free(a); free(aw); + iter = ucol_openElements(self->collator, a, asz, &status); + if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } + order = ucol_next(iter, &status); + len = ucol_getOffset(iter); +end: + if (iter != NULL) 
ucol_closeElements(iter); iter = NULL; + if (a != NULL) free(a); + if (PyErr_Occurred()) return NULL; return Py_BuildValue("ii", order, len); } // }}} +// Collator.upper_first {{{ +static PyObject * +icu_Collator_get_upper_first(icu_Collator *self, void *closure) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue val; + + val = ucol_getAttribute(self->collator, UCOL_CASE_FIRST, &status); + if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; } + + if (val == UCOL_OFF) { Py_RETURN_NONE; } + if (val) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static int +icu_Collator_set_upper_first(icu_Collator *self, PyObject *val, void *closure) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(self->collator, UCOL_CASE_FIRST, (val == Py_None) ? UCOL_OFF : ((PyObject_IsTrue(val)) ? UCOL_UPPER_FIRST : UCOL_LOWER_FIRST), &status); + if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; } + return 0; +} +// }}} + static PyObject* icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs); @@ -432,6 +397,11 @@ static PyGetSetDef icu_Collator_getsetters[] = { (char *)"The strength of this collator.", NULL}, + {(char *)"upper_first", + (getter)icu_Collator_get_upper_first, (setter)icu_Collator_set_upper_first, + (char *)"Whether this collator should always put upper case letters before lower case. Values are: None - means use the tertiary strength of the letters. True - Always sort upper case before lower case. 
False - Always sort lower case before upper case.", + NULL}, + {(char *)"numeric", (getter)icu_Collator_get_numeric, (setter)icu_Collator_set_numeric, (char *)"If True the collator sorts contiguous digits as numbers rather than strings, so 2 will sort before 10.", @@ -513,139 +483,45 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs) // }}} -// upper {{{ -static PyObject * -icu_upper(PyObject *self, PyObject *args) { - char *input, *ans, *buf3 = NULL; - const char *loc; - int32_t sz; - UChar *buf, *buf2; - PyObject *ret; +// change_case {{{ + +static PyObject* icu_change_case(PyObject *self, PyObject *args) { + char *locale = NULL; + PyObject *input = NULL, *result = NULL; + int which = UPPER_CASE; UErrorCode status = U_ZERO_ERROR; - + UChar *input_buf = NULL, *output_buf = NULL; + int32_t sz = 0; - if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL; - - sz = (int32_t)strlen(input); - - buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar)); - buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar)); - - - if (buf == NULL || buf2 == NULL) return PyErr_NoMemory(); - - u_strFromUTF8(buf, sz*4, NULL, input, sz, &status); - u_strToUpper(buf2, sz*8, buf, -1, loc, &status); - - ans = input; - sz = u_strlen(buf2); - free(buf); - - if (U_SUCCESS(status) && sz > 0) { - buf3 = (char*)calloc(sz*5+1, sizeof(char)); - if (buf3 == NULL) return PyErr_NoMemory(); - u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status); - if (U_SUCCESS(status)) ans = buf3; + if (!PyArg_ParseTuple(args, "Oiz", &input, &which, &locale)) return NULL; + if (locale == NULL) { + PyErr_SetString(PyExc_NotImplementedError, "You must specify a locale"); // We deliberately use NotImplementedError so that this error can be unambiguously identified + return NULL; } - ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace"); - if (ret == NULL) return PyErr_NoMemory(); + input_buf = python_to_icu(input, &sz, 1); + if (input_buf == NULL) goto end; + output_buf = (UChar*) calloc(3 * sz, 
sizeof(UChar)); + if (output_buf == NULL) { PyErr_NoMemory(); goto end; } - free(buf2); - if (buf3 != NULL) free(buf3); - PyMem_Free(input); - - return ret; -} // }}} - -// lower {{{ -static PyObject * -icu_lower(PyObject *self, PyObject *args) { - char *input, *ans, *buf3 = NULL; - const char *loc; - int32_t sz; - UChar *buf, *buf2; - PyObject *ret; - UErrorCode status = U_ZERO_ERROR; - - - if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL; - - sz = (int32_t)strlen(input); - - buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar)); - buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar)); - - - if (buf == NULL || buf2 == NULL) return PyErr_NoMemory(); - - u_strFromUTF8(buf, sz*4, NULL, input, sz, &status); - u_strToLower(buf2, sz*8, buf, -1, loc, &status); - - ans = input; - sz = u_strlen(buf2); - free(buf); - - if (U_SUCCESS(status) && sz > 0) { - buf3 = (char*)calloc(sz*5+1, sizeof(char)); - if (buf3 == NULL) return PyErr_NoMemory(); - u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status); - if (U_SUCCESS(status)) ans = buf3; + switch (which) { + case TITLE_CASE: + sz = u_strToTitle(output_buf, 3 * sz, input_buf, sz, NULL, locale, &status); + break; + case UPPER_CASE: + sz = u_strToUpper(output_buf, 3 * sz, input_buf, sz, locale, &status); + break; + default: + sz = u_strToLower(output_buf, 3 * sz, input_buf, sz, locale, &status); } + if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } + result = icu_to_python(output_buf, sz); - ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace"); - if (ret == NULL) return PyErr_NoMemory(); +end: + if (input_buf != NULL) free(input_buf); + if (output_buf != NULL) free(output_buf); + return result; - free(buf2); - if (buf3 != NULL) free(buf3); - PyMem_Free(input); - - return ret; -} // }}} - -// title {{{ -static PyObject * -icu_title(PyObject *self, PyObject *args) { - char *input, *ans, *buf3 = NULL; - const char *loc; - int32_t sz; - UChar *buf, *buf2; - PyObject *ret; - UErrorCode 
status = U_ZERO_ERROR; - - - if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL; - - sz = (int32_t)strlen(input); - - buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar)); - buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar)); - - - if (buf == NULL || buf2 == NULL) return PyErr_NoMemory(); - - u_strFromUTF8(buf, sz*4, NULL, input, sz, &status); - u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status); - - ans = input; - sz = u_strlen(buf2); - free(buf); - - if (U_SUCCESS(status) && sz > 0) { - buf3 = (char*)calloc(sz*5+1, sizeof(char)); - if (buf3 == NULL) return PyErr_NoMemory(); - u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status); - if (U_SUCCESS(status)) ans = buf3; - } - - ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace"); - if (ret == NULL) return PyErr_NoMemory(); - - free(buf2); - if (buf3 != NULL) free(buf3); - PyMem_Free(input); - - return ret; } // }}} // set_default_encoding {{{ @@ -662,7 +538,7 @@ icu_set_default_encoding(PyObject *self, PyObject *args) { } // }}} -// set_default_encoding {{{ +// set_filesystem_encoding {{{ static PyObject * icu_set_filesystem_encoding(PyObject *self, PyObject *args) { char *encoding; @@ -674,7 +550,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) { } // }}} -// set_default_encoding {{{ +// get_available_transliterators {{{ static PyObject * icu_get_available_transliterators(PyObject *self, PyObject *args) { PyObject *ans, *l; @@ -835,16 +711,8 @@ icu_roundtrip(PyObject *self, PyObject *args) { // Module initialization {{{ static PyMethodDef icu_methods[] = { - {"upper", icu_upper, METH_VARARGS, - "upper(locale, unicode object) -> upper cased unicode object using locale rules." - }, - - {"lower", icu_lower, METH_VARARGS, - "lower(locale, unicode object) -> lower cased unicode object using locale rules." - }, - - {"title", icu_title, METH_VARARGS, - "title(locale, unicode object) -> Title cased unicode object using locale rules." 
+ {"change_case", icu_change_case, METH_VARARGS, + "change_case(unicode object, which, locale) -> change case to one of UPPER_CASE, LOWER_CASE, TITLE_CASE" }, {"set_default_encoding", icu_set_default_encoding, METH_VARARGS, @@ -946,5 +814,9 @@ initicu(void) ADDUCONST(UNORM_NFKC); ADDUCONST(UNORM_FCD); + ADDUCONST(UPPER_CASE); + ADDUCONST(LOWER_CASE); + ADDUCONST(TITLE_CASE); + } // }}} diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 39256f6fd6..df4c369365 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -1,5 +1,7 @@ #!/usr/bin/env python -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' @@ -7,232 +9,20 @@ __docformat__ = 'restructuredtext en' # Setup code {{{ import sys -from functools import partial from calibre.constants import plugins from calibre.utils.config_base import tweaks -_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None -_locale = None +_locale = _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None _none = u'' _none2 = b'' - -def get_locale(): - global _locale - if _locale is None: - from calibre.utils.localization import get_lang - if tweaks['locale_for_sorting']: - _locale = tweaks['locale_for_sorting'] - else: - _locale = get_lang() - return _locale - -def load_icu(): - global _icu - if _icu is None: - _icu = plugins['icu'][0] - if _icu is None: - print 'Loading ICU failed with: ', plugins['icu'][1] - else: - if not getattr(_icu, 'ok', False): - print 'icu not ok' - _icu = None - return _icu - -def load_collator(): - 'The default collator for most locales takes both case and accented letters into account' - global _collator - if _collator is None: - icu = load_icu() - if icu is not None: - _collator = icu.Collator(get_locale()) - return _collator - -def 
primary_collator(): - 'Ignores case differences and accented characters' - global _primary_collator - if _primary_collator is None: - _primary_collator = _collator.clone() - _primary_collator.strength = _icu.UCOL_PRIMARY - return _primary_collator - -def sort_collator(): - 'Ignores case differences and recognizes numbers in strings' - global _sort_collator - if _sort_collator is None: - _sort_collator = _collator.clone() - _sort_collator.strength = _icu.UCOL_SECONDARY - if tweaks['numeric_collation']: - try: - _sort_collator.numeric = True - except AttributeError: - pass - return _sort_collator - -def py_sort_key(obj): - if not obj: - return _none - return obj.lower() - -def icu_sort_key(collator, obj): - if not obj: - return _none2 - try: - try: - return _sort_collator.sort_key(obj) - except AttributeError: - return sort_collator().sort_key(obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return _sort_collator.sort_key(obj) - -def numeric_collator(): - global _numeric_collator - _numeric_collator = _collator.clone() - _numeric_collator.strength = _icu.UCOL_SECONDARY - _numeric_collator.numeric = True - return _numeric_collator - -def numeric_sort_key(obj): - 'Uses natural sorting for numbers inside strings so something2 will sort before something10' - if not obj: - return _none2 - try: - try: - return _numeric_collator.sort_key(obj) - except AttributeError: - return numeric_collator().sort_key(obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return _numeric_collator.sort_key(obj) - -def icu_change_case(upper, locale, obj): - func = _icu.upper if upper else _icu.lower - try: - return func(locale, obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return func(locale, obj) - -def py_find(pattern, source): - pos = 
source.find(pattern) - if pos > -1: - return pos, len(pattern) - return -1, -1 - -def character_name(string): - try: - try: - return _icu.character_name(unicode(string)) or None - except AttributeError: - import unicodedata - return unicodedata.name(unicode(string)[0], None) - except (TypeError, ValueError, KeyError): - pass - -def character_name_from_code(code): - try: - try: - return _icu.character_name_from_code(code) or '' - except AttributeError: - import unicodedata - return unicodedata.name(py_safe_chr(code), '') - except (TypeError, ValueError, KeyError): - return '' - -if sys.maxunicode >= 0x10ffff: - try: - py_safe_chr = unichr - except NameError: - py_safe_chr = chr -else: - def py_safe_chr(i): - # Narrow builds of python cannot represent code point > 0xffff as a - # single character, so we need our own implementation of unichr - # that returns them as a surrogate pair - return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape') - -def safe_chr(code): - try: - return _icu.chr(code) - except AttributeError: - return py_safe_chr(code) - -def normalize(text, mode='NFC'): - # This is very slightly slower than using unicodedata.normalize, so stick with - # that unless you have very good reasons not too. Also, it's speed - # decreases on wide python builds, where conversion to/from ICU's string - # representation is slower. 
- try: - return _icu.normalize(_nmodes[mode], unicode(text)) - except (AttributeError, KeyError): - import unicodedata - return unicodedata.normalize(mode, unicode(text)) - -def icu_find(collator, pattern, source): - try: - return collator.find(pattern, source) - except TypeError: - return collator.find(unicode(pattern), unicode(source)) - -def icu_startswith(collator, a, b): - try: - return collator.startswith(a, b) - except TypeError: - return collator.startswith(unicode(a), unicode(b)) - -def py_case_sensitive_sort_key(obj): - if not obj: - return _none - return obj - -def icu_case_sensitive_sort_key(collator, obj): - if not obj: - return _none2 - return collator.sort_key(obj) - -def icu_strcmp(collator, a, b): - return collator.strcmp(lower(a), lower(b)) - -def py_strcmp(a, b): - return cmp(a.lower(), b.lower()) - -def icu_case_sensitive_strcmp(collator, a, b): - return collator.strcmp(a, b) - -def icu_capitalize(s): - s = lower(s) - return s.replace(s[0], upper(s[0]), 1) if s else s - _cmap = {} -def icu_contractions(collator): - global _cmap - ans = _cmap.get(collator, None) - if ans is None: - ans = collator.contractions() - ans = frozenset(filter(None, ans)) if ans else {} - _cmap[collator] = ans - return ans -def icu_collation_order(collator, a): - try: - return collator.collation_order(a) - except TypeError: - return collator.collation_order(unicode(a)) - -load_icu() -load_collator() -_icu_not_ok = _icu is None or _collator is None +_icu, err = plugins['icu'] +if _icu is None: + raise RuntimeError('Failed to load icu with error: %s' % err) +del err icu_unicode_version = getattr(_icu, 'unicode_version', None) _nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')} @@ -252,290 +42,208 @@ try: except: pass +def collator(): + global _collator, _locale + if _collator is None: + if _locale is None: + from calibre.utils.localization import get_lang + if tweaks['locale_for_sorting']: + _locale = 
tweaks['locale_for_sorting'] + else: + _locale = get_lang() + try: + _collator = _icu.Collator(_locale) + except Exception as e: + print ('Failed to load collator for locale: %r with error %r, using English' % (_locale, e)) + _collator = _icu.Collator('en') + return _collator + +def change_locale(locale=None): + global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator + _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None + _locale = locale + +def primary_collator(): + 'Ignores case differences and accented characters' + global _primary_collator + if _primary_collator is None: + _primary_collator = collator().clone() + _primary_collator.strength = _icu.UCOL_PRIMARY + return _primary_collator + +def sort_collator(): + 'Ignores case differences and recognizes numbers in strings (if the tweak is set)' + global _sort_collator + if _sort_collator is None: + _sort_collator = collator().clone() + _sort_collator.strength = _icu.UCOL_SECONDARY + _sort_collator.numeric = tweaks['numeric_collation'] + return _sort_collator + +def numeric_collator(): + 'Uses natural sorting for numbers inside strings so something2 will sort before something10' + global _numeric_collator + if _numeric_collator is None: + _numeric_collator = collator().clone() + _numeric_collator.strength = _icu.UCOL_SECONDARY + _numeric_collator.numeric = True + return _numeric_collator + +def case_sensitive_collator(): + 'Always sorts upper case letter before lower case' + global _case_sensitive_collator + if _case_sensitive_collator is None: + _case_sensitive_collator = collator().clone() + _case_sensitive_collator.numeric = sort_collator().numeric + _case_sensitive_collator.upper_first = True + return _case_sensitive_collator + +# Templates that will be used to generate various concrete +# function implementations based on different collators, to allow lazy loading +# of collators, with maximum runtime 
performance + +_sort_key_template = ''' +def {name}(obj): + try: + try: + return {collator}.{func}(obj) + except AttributeError: + return {collator_func}().{func}(obj) + except TypeError: + if isinstance(obj, bytes): + try: + obj = obj.decode(sys.getdefaultencoding()) + except ValueError: + return obj + return {collator}.{func}(obj) + return b'' +''' + +_strcmp_template = ''' +def {name}(a, b): + try: + try: + return {collator}.{func}(a, b) + except AttributeError: + return {collator_func}().{func}(a, b) + except TypeError: + if isinstance(a, bytes): + try: + a = a.decode(sys.getdefaultencoding()) + except ValueError: + return cmp(a, b) + elif a is None: + a = u'' + if isinstance(b, bytes): + try: + b = b.decode(sys.getdefaultencoding()) + except ValueError: + return cmp(a, b) + elif b is None: + b = u'' + return {collator}.{func}(a, b) +''' + +_change_case_template = ''' +def {name}(x): + try: + try: + return _icu.change_case(x, _icu.{which}, _locale) + except NotImplementedError: + collator() # sets _locale + return _icu.change_case(x, _icu.{which}, _locale) + except TypeError: + if isinstance(x, bytes): + try: + x = x.decode(sys.getdefaultencoding()) + except ValueError: + return x + return _icu.change_case(x, _icu.{which}, _locale) + raise +''' + +def _make_func(template, name, **kwargs): + l = globals() + kwargs['name'] = name + kwargs['func'] = kwargs.get('func', 'sort_key') + exec template.format(**kwargs) in l + return l[name] + # }}} ################# The string functions ######################################## +sort_key = _make_func(_sort_key_template, 'sort_key', collator='_sort_collator', collator_func='sort_collator') -sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator) +numeric_sort_key = _make_func(_sort_key_template, 'numeric_sort_key', collator='_numeric_collator', collator_func='numeric_collator') -strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator) +primary_sort_key = _make_func(_sort_key_template, 
'primary_sort_key', collator='_primary_collator', collator_func='primary_collator') -case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \ - partial(icu_case_sensitive_sort_key, _collator) +case_sensitive_sort_key = _make_func(_sort_key_template, 'case_sensitive_sort_key', + collator='_case_sensitive_collator', collator_func='case_sensitive_collator') -case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp +collation_order = _make_func(_sort_key_template, 'collation_order', collator='_sort_collator', collator_func='sort_collator', func='collation_order') -upper = (lambda s: s.upper()) if _icu_not_ok else \ - partial(icu_change_case, True, get_locale()) +strcmp = _make_func(_strcmp_template, 'strcmp', collator='_sort_collator', collator_func='sort_collator', func='strcmp') -lower = (lambda s: s.lower()) if _icu_not_ok else \ - partial(icu_change_case, False, get_locale()) +case_sensitive_strcmp = _make_func( + _strcmp_template, 'case_sensitive_strcmp', collator='_case_sensitive_collator', collator_func='case_sensitive_collator', func='strcmp') -title_case = (lambda s: s.title()) if _icu_not_ok else \ - partial(_icu.title, get_locale()) +primary_strcmp = _make_func(_strcmp_template, 'primary_strcmp', collator='_primary_collator', collator_func='primary_collator', func='strcmp') -capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \ - (lambda s: icu_capitalize(s)) +upper = _make_func(_change_case_template, 'upper', which='UPPER_CASE') -find = (py_find if _icu_not_ok else partial(icu_find, _collator)) +lower = _make_func(_change_case_template, 'lower', which='LOWER_CASE') -contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions, - _collator))) +title_case = _make_func(_change_case_template, 'title_case', which='TITLE_CASE') -def primary_strcmp(a, b): - 'strcmp that ignores case and accents on letters' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return py_strcmp(ascii_text(a), 
ascii_text(b)) +capitalize = lambda x: upper(x[:1]) + lower(x[1:]) + +find = _make_func(_strcmp_template, 'find', collator='_collator', collator_func='collator', func='find') + +primary_find = _make_func(_strcmp_template, 'primary_find', collator='_primary_collator', collator_func='primary_collator', func='find') + +startswith = _make_func(_strcmp_template, 'startswith', collator='_collator', collator_func='collator', func='startswith') + +primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator='_primary_collator', collator_func='primary_collator', func='startswith') + +safe_chr = _icu.chr + +def character_name(string): try: - return _primary_collator.strcmp(a, b) - except AttributeError: - return primary_collator().strcmp(a, b) + return _icu.character_name(unicode(string)) or None + except (TypeError, ValueError, KeyError): + pass -def primary_find(pat, src): - 'find that ignores case and accents on letters' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return py_find(ascii_text(pat), ascii_text(src)) - return primary_icu_find(pat, src) - -def primary_icu_find(pat, src): +def character_name_from_code(code): try: - return icu_find(_primary_collator, pat, src) - except AttributeError: - return icu_find(primary_collator(), pat, src) + return _icu.character_name_from_code(code) or '' + except (TypeError, ValueError, KeyError): + return '' -def primary_sort_key(val): - 'A sort key that ignores case and diacritics' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return ascii_text(val).lower() - try: - return _primary_collator.sort_key(val) - except AttributeError: - return primary_collator().sort_key(val) +def normalize(text, mode='NFC'): + # This is very slightly slower than using unicodedata.normalize, so stick with + # that unless you have very good reasons not to. Also, its speed + # decreases on wide python builds, where conversion to/from ICU's string + # representation is slower. 
+ return _icu.normalize(_nmodes[mode], unicode(text)) -def primary_startswith(a, b): - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return ascii_text(a).lower().startswith(ascii_text(b).lower()) - try: - return icu_startswith(_primary_collator, a, b) - except AttributeError: - return icu_startswith(primary_collator(), a, b) +def contractions(col=None): + global _cmap + col = col or _collator + if col is None: + col = collator() + ans = _cmap.get(col, None) + if ans is None: + ans = col.contractions() + ans = frozenset(filter(None, ans)) + _cmap[col] = ans + return ans -def collation_order(a): - if _icu_not_ok: - return (ord(a[0]), 1) if a else (0, 0) - try: - return icu_collation_order(_sort_collator, a) - except AttributeError: - return icu_collation_order(sort_collator(), a) ################################################################################ -def test(): # {{{ - from calibre import prints - # Data {{{ - german = ''' - Sonntag -Montag -Dienstag -Januar -Februar -März -Fuße -Fluße -Flusse -flusse -fluße -flüße -flüsse -''' - german_good = ''' - Dienstag -Februar -flusse -Flusse -fluße -Fluße -flüsse -flüße -Fuße -Januar -März -Montag -Sonntag''' - french = ''' -dimanche -lundi -mardi -janvier -février -mars -déjà -Meme -deja -même -dejà -bpef -bœg -Boef -Mémé -bœf -boef -bnef -pêche -pèché -pêché -pêche -pêché''' - french_good = ''' - bnef - boef - Boef - bœf - bœg - bpef - deja - dejà - déjà - dimanche - février - janvier - lundi - mardi - mars - Meme - Mémé - même - pèché - pêche - pêche - pêché - pêché''' - # }}} - - def create(l): - l = l.decode('utf-8').splitlines() - return [x.strip() for x in l if x.strip()] - - def test_strcmp(entries): - for x in entries: - for y in entries: - if strcmp(x, y) != cmp(sort_key(x), sort_key(y)): - print 'strcmp failed for %r, %r'%(x, y) - - german = create(german) - c = _icu.Collator('de') - c.numeric = True - gs = list(sorted(german, key=c.sort_key)) - if gs != create(german_good): - 
print 'German sorting failed' - return - print - french = create(french) - c = _icu.Collator('fr') - c.numeric = True - fs = list(sorted(french, key=c.sort_key)) - if fs != create(french_good): - print 'French sorting failed (note that French fails with icu < 4.6)' - return - test_strcmp(german + french) - - print '\nTesting case transforms in current locale' - from calibre.utils.titlecase import titlecase - for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'): - print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8') - print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8') - print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8') - print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8') - print - - print '\nTesting primary collation' - for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', - u'Štepánek':u'ŠtepaneK'}.iteritems(): - if primary_strcmp(k, v) != 0: - prints('primary_strcmp() failed with %s != %s'%(k, v)) - return - if primary_find(v, u' '+k)[0] != 1: - prints('primary_find() failed with %s not in %s'%(v, k)) - return - - n = character_name(safe_chr(0x1f431)) - if n != u'CAT FACE': - raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE') - - global _primary_collator - orig = _primary_collator - _primary_collator = _icu.Collator('es') - if primary_strcmp(u'peña', u'pena') == 0: - print 'Primary collation in Spanish locale failed' - return - _primary_collator = orig - - print '\nTesting contractions' - c = _icu.Collator('cs') - if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch', - u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH', - u'S\u030c', u'R\u030c']): - print 'Contractions for the Czech language failed' - return - - print '\nTesting startswith' - p = 
primary_startswith - if (not p('asd', 'asd') or not p('asd', 'A') or - not p('x', '')): - print 'startswith() failed' - return - - print '\nTesting collation_order()' - for group in [ - ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'), - ('calibre', 'Charon', 'Collins'), - ('01', '1'), - ('1', '11', '13'), - ]: - last = None - for x in group: - val = icu_collation_order(sort_collator(), x) - if val[1] != 1: - prints('collation_order() returned incorrect length for', x) - if last is None: - last = val - else: - if val != last: - prints('collation_order() returned incorrect value for', x) - last = val - -# }}} - -def test_roundtrip(): - for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): - rp = _icu.roundtrip(r) - if rp != r: - raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp)) - -def test_normalize_performance(): - import os - if not os.path.exists('t.txt'): - return - raw = open('t.txt', 'rb').read().decode('utf-8') - print (len(raw)) - import time, unicodedata - st = time.time() - count = 100 - for i in xrange(count): - normalize(raw) - print ('ICU time:', time.time() - st) - st = time.time() - for i in xrange(count): - unicodedata.normalize('NFC', unicode(raw)) - print ('py time:', time.time() - st) - if __name__ == '__main__': - test_roundtrip() - test_normalize_performance() - test() + from calibre.utils.icu_test import run + run(verbosity=4) diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py new file mode 100644 index 0000000000..e96397e86a --- /dev/null +++ b/src/calibre/utils/icu_test.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2014, Kovid Goyal ' + +import unittest, sys +from contextlib import contextmanager + +import calibre.utils.icu as icu + + +@contextmanager +def make_collation_func(name, locale, numeric=True, template='_sort_key_template', func='strcmp'): + 
c = icu._icu.Collator(locale) + cname = '%s_test_collator%s' % (name, template) + setattr(icu, cname, c) + c.numeric = numeric + yield icu._make_func(getattr(icu, template), name, collator=cname, collator_func='not_used_xxx', func=func) + delattr(icu, cname) + +class TestICU(unittest.TestCase): + + ae = unittest.TestCase.assertEqual + + def setUp(self): + icu.change_locale('en') + + def test_sorting(self): + ' Test the various sorting APIs ' + german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split() + german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split() + french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split() + french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split() # noqa + + # Test corner cases + sort_key = icu.sort_key + s = '\U0001f431' + self.ae(sort_key(s), sort_key(s.encode(sys.getdefaultencoding())), 'UTF-8 encoded object not correctly decoded to generate sort key') + self.ae(s.encode('utf-16'), sort_key(s.encode('utf-16')), 'Undecodable bytestring not returned as itself') + self.ae(b'', sort_key(None)) + self.ae(0, icu.strcmp(None, b'')) + self.ae(0, icu.strcmp(s, s.encode(sys.getdefaultencoding()))) + + # Test locales + with make_collation_func('dsk', 'de', func='sort_key') as dsk: + self.ae(german_good, sorted(german, key=dsk)) + with make_collation_func('dcmp', 'de', template='_strcmp_template') as dcmp: + for x in german: + for y in german: + self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y)) + + with make_collation_func('fsk', 'fr', func='sort_key') as fsk: + self.ae(french_good, sorted(french, key=fsk)) + with make_collation_func('fcmp', 'fr', template='_strcmp_template') as fcmp: + for x in french: + for y in french: + self.ae(cmp(fsk(x), fsk(y)), fcmp(x, 
y)) + + with make_collation_func('ssk', 'es', func='sort_key') as ssk: + self.assertNotEqual(ssk('peña'), ssk('pena')) + with make_collation_func('scmp', 'es', template='_strcmp_template') as scmp: + self.assertNotEqual(0, scmp('pena', 'peña')) + + for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems(): + self.ae(0, icu.primary_strcmp(k, v)) + + # Test different types of collation + self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa')) + self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11')) + self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a')) + self.ae(0, icu.strcmp('a', 'A')) + self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A')) + self.ae(0, icu.primary_strcmp('ä', 'A')) + + def test_change_case(self): + ' Test the various ways of changing the case ' + from calibre.utils.titlecase import titlecase + # Test corner cases + self.ae('A', icu.upper(b'a')) + + for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'): + self.ae(icu.upper(x), x.upper()) + self.ae(icu.lower(x), x.lower()) + # ICU's title case algorithm is different from ours, when there are + # capitals inside words + self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine')) + self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower()) + + def test_find(self): + ' Test searching for substrings ' + self.ae((1, 1), icu.find(b'a', b'1ab')) + self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x')) + self.ae((0, 4), icu.primary_find('pena', 'peña')) + for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems(): + self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k)) + self.assertTrue(icu.startswith(b'abc', b'ab')) + self.assertTrue(icu.startswith('abc', 'abc')) + self.assertFalse(icu.startswith('xyz', 'a')) + self.assertTrue(icu.startswith('xxx', '')) + self.assertTrue(icu.primary_startswith('pena', 
'peña')) + + def test_collation_order(self): + 'Testing collation ordering' + for group in [ + ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'), + ('01', '1'), + ('1', '11', '13'), + ]: + last = None + for x in group: + order, length = icu.numeric_collator().collation_order(x) + if last is not None: + self.ae(last, order) + last = order + + def test_roundtrip(self): + for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): + self.ae(r, icu._icu.roundtrip(r)) + + def test_character_name(self): + self.ae(icu.character_name('\U0001f431'), 'CAT FACE') + + def test_contractions(self): + c = icu._icu.Collator('cs') + self.ae(icu.contractions(c), frozenset({u'Z\u030c', u'z\u030c', u'Ch', + u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH', + u'S\u030c', u'R\u030c'})) + +class TestRunner(unittest.main): + + def createTests(self): + tl = unittest.TestLoader() + self.test = tl.loadTestsFromTestCase(TestICU) + +def run(verbosity=4): + TestRunner(verbosity=verbosity, exit=False) + +def test_build(): + result = TestRunner(verbosity=0, buffer=True, catchbreak=True, failfast=True, argv=sys.argv[:1], exit=False).result + if not result.wasSuccessful(): + raise SystemExit(1) + +if __name__ == '__main__': + run(verbosity=4) +