Performance improvements and code cleanup for the ICU module

This commit is contained in:
Kovid Goyal 2014-03-07 21:46:01 +05:30
parent b8e414f18b
commit f078cd7168
5 changed files with 523 additions and 796 deletions

View File

@ -14,13 +14,13 @@ from PyQt4.Qt import (QLineEdit, QAbstractListModel, Qt, pyqtSignal, QObject,
QApplication, QListView, QPoint, QModelIndex, QFont, QFontInfo)
from calibre.constants import isosx, get_osx_version
from calibre.utils.icu import sort_key, primary_startswith, primary_icu_find
from calibre.utils.icu import sort_key, primary_startswith, primary_find
from calibre.gui2 import NONE
from calibre.gui2.widgets import EnComboBox, LineEditECM
from calibre.utils.config import tweaks
def containsq(x, prefix):
return primary_icu_find(prefix, x)[0] != -1
return primary_find(prefix, x)[0] != -1
class CompleteModel(QAbstractListModel): # {{{

View File

@ -113,10 +113,9 @@ def test_ssl():
print ('SSL OK!')
def test_icu():
from calibre.utils.icu import _icu_not_ok, test_roundtrip
if _icu_not_ok:
raise RuntimeError('ICU module not loaded/valid')
test_roundtrip()
print ('Testing ICU')
from calibre.utils.icu_test import test_build
test_build()
print ('ICU OK!')
def test_wpd():

View File

@ -1,5 +1,9 @@
#include "icu_calibre_utils.h"
#define UPPER_CASE 0
#define LOWER_CASE 1
#define TITLE_CASE 2
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
wchar_t *buf = NULL;
PyObject *ans = NULL;
@ -66,20 +70,16 @@ icu_Collator_display_name(icu_Collator *self, void *closure) {
const char *loc = NULL;
UErrorCode status = U_ZERO_ERROR;
UChar dname[400];
char buf[100];
int32_t sz = 0;
loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status);
if (loc == NULL || U_FAILURE(status)) {
if (loc == NULL) {
PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL;
}
ucol_getDisplayName(loc, "en", dname, 100, &status);
if (U_FAILURE(status)) return PyErr_NoMemory();
sz = ucol_getDisplayName(loc, "en", dname, sizeof(dname), &status);
if (U_FAILURE(status)) {PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }
u_strToUTF8(buf, 100, NULL, dname, -1, &status);
if (U_FAILURE(status)) {
PyErr_SetString(PyExc_Exception, "Failed to convert dname to UTF-8"); return NULL;
}
return Py_BuildValue("s", buf);
return icu_to_python(dname, sz);
}
// }}}
@ -140,47 +140,29 @@ icu_Collator_capsule(icu_Collator *self, void *closure) {
// Collator.sort_key {{{
static PyObject *
icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
char *input;
int32_t sz;
UChar *buf;
uint8_t *buf2;
PyObject *ans;
int32_t key_size;
UErrorCode status = U_ZERO_ERROR;
int32_t sz = 0, key_size = 0, bsz = 0;
UChar *buf = NULL;
uint8_t *buf2 = NULL;
PyObject *ans = NULL, *input = NULL;
if (!PyArg_ParseTuple(args, "es", "UTF-8", &input)) return NULL;
if (!PyArg_ParseTuple(args, "O", &input)) return NULL;
buf = python_to_icu(input, &sz, 1);
if (buf == NULL) return NULL;
sz = (int32_t)strlen(input);
bsz = 7 * sz + 1;
buf2 = (uint8_t*)calloc(bsz, sizeof(uint8_t));
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, bsz);
if (key_size > bsz) {
buf2 = realloc(buf2, (key_size + 1) * sizeof(uint8_t));
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, key_size + 1);
}
ans = PyBytes_FromStringAndSize((char*)buf2, key_size);
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
if (buf == NULL) return PyErr_NoMemory();
u_strFromUTF8(buf, sz*4 + 1, &key_size, input, sz, &status);
PyMem_Free(input);
if (U_SUCCESS(status)) {
buf2 = (uint8_t*)calloc(7*sz+1, sizeof(uint8_t));
if (buf2 == NULL) return PyErr_NoMemory();
key_size = ucol_getSortKey(self->collator, buf, -1, buf2, 7*sz+1);
if (key_size == 0) {
ans = PyBytes_FromString("");
} else {
if (key_size >= 7*sz+1) {
free(buf2);
buf2 = (uint8_t*)calloc(key_size+1, sizeof(uint8_t));
if (buf2 == NULL) return PyErr_NoMemory();
ucol_getSortKey(self->collator, buf, -1, buf2, key_size+1);
}
ans = PyBytes_FromString((char *)buf2);
}
free(buf2);
} else ans = PyBytes_FromString("");
free(buf);
if (ans == NULL) return PyErr_NoMemory();
end:
if (buf != NULL) free(buf);
if (buf2 != NULL) free(buf2);
return ans;
} // }}}
@ -188,86 +170,64 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
// Collator.strcmp {{{
static PyObject *
icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
char *a_, *b_;
int32_t asz, bsz;
UChar *a, *b;
UErrorCode status = U_ZERO_ERROR;
PyObject *a_ = NULL, *b_ = NULL;
int32_t asz = 0, bsz = 0;
UChar *a = NULL, *b = NULL;
UCollationResult res = UCOL_EQUAL;
if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL;
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
asz = (int32_t)strlen(a_); bsz = (int32_t)strlen(b_);
a = python_to_icu(a_, &asz, 1);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
if (b == NULL) goto end;
res = ucol_strcoll(self->collator, a, asz, b, bsz);
end:
if (a != NULL) free(a); if (b != NULL) free(b);
a = (UChar*)calloc(asz*4 + 1, sizeof(UChar));
b = (UChar*)calloc(bsz*4 + 1, sizeof(UChar));
if (a == NULL || b == NULL) return PyErr_NoMemory();
u_strFromUTF8(a, asz*4 + 1, NULL, a_, asz, &status);
u_strFromUTF8(b, bsz*4 + 1, NULL, b_, bsz, &status);
PyMem_Free(a_); PyMem_Free(b_);
if (U_SUCCESS(status))
res = ucol_strcoll(self->collator, a, -1, b, -1);
free(a); free(b);
return Py_BuildValue("i", res);
return (PyErr_Occurred()) ? NULL : Py_BuildValue("i", res);
} // }}}
// Collator.find {{{
static PyObject *
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
PyObject *a_, *b_;
int32_t asz, bsz;
UChar *a, *b;
wchar_t *aw, *bw;
PyObject *a_ = NULL, *b_ = NULL;
UChar *a = NULL, *b = NULL;
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *search = NULL;
int32_t pos = -1, length = -1;
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status);
u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status);
a = python_to_icu(a_, &asz, 1);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
if (b == NULL) goto end;
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
if (U_SUCCESS(status)) {
search = usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status);
if (U_SUCCESS(status)) {
pos = usearch_first(search, &status);
if (pos != USEARCH_DONE)
length = usearch_getMatchedLength(search);
else
pos = -1;
}
if (search != NULL) usearch_close(search);
pos = usearch_first(search, &status);
if (pos != USEARCH_DONE)
length = usearch_getMatchedLength(search);
else
pos = -1;
}
end:
if (search != NULL) usearch_close(search);
if (a != NULL) free(a);
if (b != NULL) free(b);
free(a); free(b); free(aw); free(bw);
return Py_BuildValue("ii", pos, length);
return (PyErr_Occurred()) ? NULL : Py_BuildValue("ii", pos, length);
} // }}}
// Collator.contractions {{{
static PyObject *
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
UErrorCode status = U_ZERO_ERROR;
UChar *str;
UChar *str = NULL;
UChar32 start=0, end=0;
int32_t count = 0, len = 0, dlen = 0, i;
int32_t count = 0, len = 0, i;
PyObject *ans = Py_None, *pbuf;
wchar_t *buf;
if (self->contractions == NULL) {
self->contractions = uset_open(1, 0);
@ -275,107 +235,112 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs)
self->contractions = ucol_getTailoredSet(self->collator, &status);
}
status = U_ZERO_ERROR;
count = uset_getItemCount(self->contractions);
str = (UChar*)calloc(100, sizeof(UChar));
buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t));
if (str == NULL || buf == NULL) return PyErr_NoMemory();
count = uset_getItemCount(self->contractions);
if (str == NULL) { PyErr_NoMemory(); goto end; }
ans = PyTuple_New(count);
if (ans != NULL) {
for (i = 0; i < count; i++) {
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
if (len >= 2) {
// We have a string
status = U_ZERO_ERROR;
u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status);
pbuf = PyUnicode_FromWideChar(buf, dlen);
if (pbuf == NULL) return PyErr_NoMemory();
PyTuple_SetItem(ans, i, pbuf);
} else {
// Ranges dont make sense for contractions, ignore them
PyTuple_SetItem(ans, i, Py_None);
}
if (ans == NULL) { goto end; }
for (i = 0; i < count; i++) {
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
if (len >= 2) {
// We have a string
status = U_ZERO_ERROR;
pbuf = icu_to_python(str, len);
if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
PyTuple_SetItem(ans, i, pbuf);
} else {
// Ranges dont make sense for contractions, ignore them
PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None);
}
}
free(str); free(buf);
end:
if (str != NULL) free(str);
return Py_BuildValue("O", ans);
return ans;
} // }}}
// Collator.startswith {{{
static PyObject *
icu_Collator_startswith(icu_Collator *self, PyObject *args, PyObject *kwargs) {
PyObject *a_, *b_;
int32_t asz, bsz;
int32_t actual_a, actual_b;
UChar *a, *b;
wchar_t *aw, *bw;
UErrorCode status = U_ZERO_ERROR;
int ans = 0;
PyObject *a_ = NULL, *b_ = NULL;
int32_t asz = 0, bsz = 0;
UChar *a = NULL, *b = NULL;
uint8_t ans = 0;
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
if (asz < bsz) Py_RETURN_FALSE;
if (bsz == 0) Py_RETURN_TRUE;
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
a = python_to_icu(a_, &asz, 1);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
if (b == NULL) goto end;
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
if (asz < bsz) goto end;
if (bsz == 0) { ans = 1; goto end; }
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
actual_b = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
if (actual_a > -1 && actual_b > -1) {
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
u_strFromWCS(b, bsz*4 + 1, &actual_b, bw, -1, &status);
ans = ucol_equal(self->collator, a, bsz, b, bsz);
if (U_SUCCESS(status) && ucol_equal(self->collator, a, actual_b, b, actual_b))
ans = 1;
}
end:
if (a != NULL) free(a);
if (b != NULL) free(b);
free(a); free(b); free(aw); free(bw);
if (ans) Py_RETURN_TRUE;
if (PyErr_Occurred()) return NULL;
if (ans) { Py_RETURN_TRUE; }
Py_RETURN_FALSE;
} // }}}
// Collator.startswith {{{
// Collator.collation_order {{{
static PyObject *
icu_Collator_collation_order(icu_Collator *self, PyObject *args, PyObject *kwargs) {
PyObject *a_;
int32_t asz;
int32_t actual_a;
UChar *a;
wchar_t *aw;
PyObject *a_ = NULL;
int32_t asz = 0;
UChar *a = NULL;
UErrorCode status = U_ZERO_ERROR;
UCollationElements *iter = NULL;
int order = 0, len = -1;
if (!PyArg_ParseTuple(args, "U", &a_)) return NULL;
asz = (int32_t)PyUnicode_GetSize(a_);
if (!PyArg_ParseTuple(args, "O", &a_)) return NULL;
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
a = python_to_icu(a_, &asz, 1);
if (a == NULL) goto end;
if (a == NULL || aw == NULL ) return PyErr_NoMemory();
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
if (actual_a > -1) {
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
iter = ucol_openElements(self->collator, a, actual_a, &status);
if (iter != NULL && U_SUCCESS(status)) {
order = ucol_next(iter, &status);
len = ucol_getOffset(iter);
ucol_closeElements(iter); iter = NULL;
}
}
free(a); free(aw);
iter = ucol_openElements(self->collator, a, asz, &status);
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
order = ucol_next(iter, &status);
len = ucol_getOffset(iter);
end:
if (iter != NULL) ucol_closeElements(iter); iter = NULL;
if (a != NULL) free(a);
if (PyErr_Occurred()) return NULL;
return Py_BuildValue("ii", order, len);
} // }}}
// Collator.upper_first {{{
static PyObject *
icu_Collator_get_upper_first(icu_Collator *self, void *closure) {
    // Getter for Collator.upper_first. Reads the UCOL_CASE_FIRST attribute and
    // returns None for UCOL_OFF, True for UCOL_UPPER_FIRST and False for any
    // other value (i.e. UCOL_LOWER_FIRST). Raises ValueError if ICU reports a
    // failure.
    UErrorCode status = U_ZERO_ERROR;
    UColAttributeValue val;

    val = ucol_getAttribute(self->collator, UCOL_CASE_FIRST, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }

    if (val == UCOL_OFF) { Py_RETURN_NONE; }
    // Compare against the enum explicitly: UCOL_LOWER_FIRST is also a non-zero
    // value, so a plain truth test would report True for it as well.
    if (val == UCOL_UPPER_FIRST) { Py_RETURN_TRUE; }
    Py_RETURN_FALSE;
}
static int
icu_Collator_set_upper_first(icu_Collator *self, PyObject *val, void *closure) {
    // Setter for Collator.upper_first. None maps to UCOL_OFF, a true value to
    // UCOL_UPPER_FIRST and a false value to UCOL_LOWER_FIRST.
    // Returns 0 on success, -1 with a Python exception set on failure.
    UErrorCode status = U_ZERO_ERROR;
    UColAttributeValue case_first = UCOL_OFF;
    int truth = 0;

    // Python passes NULL when the attribute is being deleted
    if (val == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete the upper_first attribute");
        return -1;
    }

    if (val != Py_None) {
        truth = PyObject_IsTrue(val);
        // -1 means the truth test itself raised; propagate that exception
        // instead of silently treating it as True
        if (truth < 0) return -1;
        case_first = (truth) ? UCOL_UPPER_FIRST : UCOL_LOWER_FIRST;
    }

    ucol_setAttribute(self->collator, UCOL_CASE_FIRST, case_first, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; }
    return 0;
}
// }}}
static PyObject*
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs);
@ -432,6 +397,11 @@ static PyGetSetDef icu_Collator_getsetters[] = {
(char *)"The strength of this collator.",
NULL},
{(char *)"upper_first",
(getter)icu_Collator_get_upper_first, (setter)icu_Collator_set_upper_first,
(char *)"Whether this collator should always put upper case letters before lower case. Values are: None - means use the tertiary strength of the letters. True - Always sort upper case before lower case. False - Always sort lower case before upper case.",
NULL},
{(char *)"numeric",
(getter)icu_Collator_get_numeric, (setter)icu_Collator_set_numeric,
(char *)"If True the collator sorts contiguous digits as numbers rather than strings, so 2 will sort before 10.",
@ -513,139 +483,45 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
// }}}
// upper {{{
static PyObject *
icu_upper(PyObject *self, PyObject *args) {
char *input, *ans, *buf3 = NULL;
const char *loc;
int32_t sz;
UChar *buf, *buf2;
PyObject *ret;
// change_case {{{
static PyObject* icu_change_case(PyObject *self, PyObject *args) {
char *locale = NULL;
PyObject *input = NULL, *result = NULL;
int which = UPPER_CASE;
UErrorCode status = U_ZERO_ERROR;
UChar *input_buf = NULL, *output_buf = NULL;
int32_t sz = 0;
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
sz = (int32_t)strlen(input);
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
u_strToUpper(buf2, sz*8, buf, -1, loc, &status);
ans = input;
sz = u_strlen(buf2);
free(buf);
if (U_SUCCESS(status) && sz > 0) {
buf3 = (char*)calloc(sz*5+1, sizeof(char));
if (buf3 == NULL) return PyErr_NoMemory();
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
if (U_SUCCESS(status)) ans = buf3;
if (!PyArg_ParseTuple(args, "Oiz", &input, &which, &locale)) return NULL;
if (locale == NULL) {
PyErr_SetString(PyExc_NotImplementedError, "You must specify a locale"); // We deliberately use NotImplementedError so that this error can be unambiguously identified
return NULL;
}
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
if (ret == NULL) return PyErr_NoMemory();
input_buf = python_to_icu(input, &sz, 1);
if (input_buf == NULL) goto end;
output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
if (output_buf == NULL) { PyErr_NoMemory(); goto end; }
free(buf2);
if (buf3 != NULL) free(buf3);
PyMem_Free(input);
return ret;
} // }}}
// lower {{{
static PyObject *
icu_lower(PyObject *self, PyObject *args) {
char *input, *ans, *buf3 = NULL;
const char *loc;
int32_t sz;
UChar *buf, *buf2;
PyObject *ret;
UErrorCode status = U_ZERO_ERROR;
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
sz = (int32_t)strlen(input);
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
u_strToLower(buf2, sz*8, buf, -1, loc, &status);
ans = input;
sz = u_strlen(buf2);
free(buf);
if (U_SUCCESS(status) && sz > 0) {
buf3 = (char*)calloc(sz*5+1, sizeof(char));
if (buf3 == NULL) return PyErr_NoMemory();
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
if (U_SUCCESS(status)) ans = buf3;
switch (which) {
case TITLE_CASE:
sz = u_strToTitle(output_buf, 3 * sz, input_buf, sz, NULL, locale, &status);
break;
case UPPER_CASE:
sz = u_strToUpper(output_buf, 3 * sz, input_buf, sz, locale, &status);
break;
default:
sz = u_strToLower(output_buf, 3 * sz, input_buf, sz, locale, &status);
}
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
result = icu_to_python(output_buf, sz);
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
if (ret == NULL) return PyErr_NoMemory();
end:
if (input_buf != NULL) free(input_buf);
if (output_buf != NULL) free(output_buf);
return result;
free(buf2);
if (buf3 != NULL) free(buf3);
PyMem_Free(input);
return ret;
} // }}}
// title {{{
static PyObject *
icu_title(PyObject *self, PyObject *args) {
char *input, *ans, *buf3 = NULL;
const char *loc;
int32_t sz;
UChar *buf, *buf2;
PyObject *ret;
UErrorCode status = U_ZERO_ERROR;
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
sz = (int32_t)strlen(input);
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status);
ans = input;
sz = u_strlen(buf2);
free(buf);
if (U_SUCCESS(status) && sz > 0) {
buf3 = (char*)calloc(sz*5+1, sizeof(char));
if (buf3 == NULL) return PyErr_NoMemory();
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
if (U_SUCCESS(status)) ans = buf3;
}
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
if (ret == NULL) return PyErr_NoMemory();
free(buf2);
if (buf3 != NULL) free(buf3);
PyMem_Free(input);
return ret;
} // }}}
// set_default_encoding {{{
@ -662,7 +538,7 @@ icu_set_default_encoding(PyObject *self, PyObject *args) {
}
// }}}
// set_default_encoding {{{
// set_filesystem_encoding {{{
static PyObject *
icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
char *encoding;
@ -674,7 +550,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
}
// }}}
// set_default_encoding {{{
// get_available_transliterators {{{
static PyObject *
icu_get_available_transliterators(PyObject *self, PyObject *args) {
PyObject *ans, *l;
@ -835,16 +711,8 @@ icu_roundtrip(PyObject *self, PyObject *args) {
// Module initialization {{{
static PyMethodDef icu_methods[] = {
{"upper", icu_upper, METH_VARARGS,
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
},
{"lower", icu_lower, METH_VARARGS,
"lower(locale, unicode object) -> lower cased unicode object using locale rules."
},
{"title", icu_title, METH_VARARGS,
"title(locale, unicode object) -> Title cased unicode object using locale rules."
{"change_case", icu_change_case, METH_VARARGS,
"change_case(unicode object, which, locale) -> change case to one of UPPER_CASE, LOWER_CASE, TITLE_CASE"
},
{"set_default_encoding", icu_set_default_encoding, METH_VARARGS,
@ -946,5 +814,9 @@ initicu(void)
ADDUCONST(UNORM_NFKC);
ADDUCONST(UNORM_FCD);
ADDUCONST(UPPER_CASE);
ADDUCONST(LOWER_CASE);
ADDUCONST(TITLE_CASE);
}
// }}}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
@ -7,232 +9,20 @@ __docformat__ = 'restructuredtext en'
# Setup code {{{
import sys
from functools import partial
from calibre.constants import plugins
from calibre.utils.config_base import tweaks
_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None
_locale = None
_locale = _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None
_none = u''
_none2 = b''
def get_locale():
global _locale
if _locale is None:
from calibre.utils.localization import get_lang
if tweaks['locale_for_sorting']:
_locale = tweaks['locale_for_sorting']
else:
_locale = get_lang()
return _locale
def load_icu():
global _icu
if _icu is None:
_icu = plugins['icu'][0]
if _icu is None:
print 'Loading ICU failed with: ', plugins['icu'][1]
else:
if not getattr(_icu, 'ok', False):
print 'icu not ok'
_icu = None
return _icu
def load_collator():
'The default collator for most locales takes both case and accented letters into account'
global _collator
if _collator is None:
icu = load_icu()
if icu is not None:
_collator = icu.Collator(get_locale())
return _collator
def primary_collator():
'Ignores case differences and accented characters'
global _primary_collator
if _primary_collator is None:
_primary_collator = _collator.clone()
_primary_collator.strength = _icu.UCOL_PRIMARY
return _primary_collator
def sort_collator():
'Ignores case differences and recognizes numbers in strings'
global _sort_collator
if _sort_collator is None:
_sort_collator = _collator.clone()
_sort_collator.strength = _icu.UCOL_SECONDARY
if tweaks['numeric_collation']:
try:
_sort_collator.numeric = True
except AttributeError:
pass
return _sort_collator
def py_sort_key(obj):
if not obj:
return _none
return obj.lower()
def icu_sort_key(collator, obj):
if not obj:
return _none2
try:
try:
return _sort_collator.sort_key(obj)
except AttributeError:
return sort_collator().sort_key(obj)
except TypeError:
if isinstance(obj, unicode):
obj = obj.replace(u'\0', u'')
else:
obj = obj.replace(b'\0', b'')
return _sort_collator.sort_key(obj)
def numeric_collator():
global _numeric_collator
_numeric_collator = _collator.clone()
_numeric_collator.strength = _icu.UCOL_SECONDARY
_numeric_collator.numeric = True
return _numeric_collator
def numeric_sort_key(obj):
'Uses natural sorting for numbers inside strings so something2 will sort before something10'
if not obj:
return _none2
try:
try:
return _numeric_collator.sort_key(obj)
except AttributeError:
return numeric_collator().sort_key(obj)
except TypeError:
if isinstance(obj, unicode):
obj = obj.replace(u'\0', u'')
else:
obj = obj.replace(b'\0', b'')
return _numeric_collator.sort_key(obj)
def icu_change_case(upper, locale, obj):
func = _icu.upper if upper else _icu.lower
try:
return func(locale, obj)
except TypeError:
if isinstance(obj, unicode):
obj = obj.replace(u'\0', u'')
else:
obj = obj.replace(b'\0', b'')
return func(locale, obj)
def py_find(pattern, source):
pos = source.find(pattern)
if pos > -1:
return pos, len(pattern)
return -1, -1
def character_name(string):
try:
try:
return _icu.character_name(unicode(string)) or None
except AttributeError:
import unicodedata
return unicodedata.name(unicode(string)[0], None)
except (TypeError, ValueError, KeyError):
pass
def character_name_from_code(code):
try:
try:
return _icu.character_name_from_code(code) or ''
except AttributeError:
import unicodedata
return unicodedata.name(py_safe_chr(code), '')
except (TypeError, ValueError, KeyError):
return ''
if sys.maxunicode >= 0x10ffff:
try:
py_safe_chr = unichr
except NameError:
py_safe_chr = chr
else:
def py_safe_chr(i):
# Narrow builds of python cannot represent code point > 0xffff as a
# single character, so we need our own implementation of unichr
# that returns them as a surrogate pair
return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
def safe_chr(code):
try:
return _icu.chr(code)
except AttributeError:
return py_safe_chr(code)
def normalize(text, mode='NFC'):
# This is very slightly slower than using unicodedata.normalize, so stick with
# that unless you have very good reasons not too. Also, it's speed
# decreases on wide python builds, where conversion to/from ICU's string
# representation is slower.
try:
return _icu.normalize(_nmodes[mode], unicode(text))
except (AttributeError, KeyError):
import unicodedata
return unicodedata.normalize(mode, unicode(text))
def icu_find(collator, pattern, source):
try:
return collator.find(pattern, source)
except TypeError:
return collator.find(unicode(pattern), unicode(source))
def icu_startswith(collator, a, b):
try:
return collator.startswith(a, b)
except TypeError:
return collator.startswith(unicode(a), unicode(b))
def py_case_sensitive_sort_key(obj):
if not obj:
return _none
return obj
def icu_case_sensitive_sort_key(collator, obj):
if not obj:
return _none2
return collator.sort_key(obj)
def icu_strcmp(collator, a, b):
return collator.strcmp(lower(a), lower(b))
def py_strcmp(a, b):
return cmp(a.lower(), b.lower())
def icu_case_sensitive_strcmp(collator, a, b):
return collator.strcmp(a, b)
def icu_capitalize(s):
s = lower(s)
return s.replace(s[0], upper(s[0]), 1) if s else s
_cmap = {}
def icu_contractions(collator):
global _cmap
ans = _cmap.get(collator, None)
if ans is None:
ans = collator.contractions()
ans = frozenset(filter(None, ans)) if ans else {}
_cmap[collator] = ans
return ans
def icu_collation_order(collator, a):
try:
return collator.collation_order(a)
except TypeError:
return collator.collation_order(unicode(a))
load_icu()
load_collator()
_icu_not_ok = _icu is None or _collator is None
_icu, err = plugins['icu']
if _icu is None:
raise RuntimeError('Failed to load icu with error: %s' % err)
del err
icu_unicode_version = getattr(_icu, 'unicode_version', None)
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
@ -252,290 +42,208 @@ try:
except:
pass
def collator():
    'The base collator for the current locale, created on first use. Falls back to English if the locale cannot be loaded'
    global _collator, _locale
    if _collator is not None:
        return _collator
    if _locale is None:
        # No locale chosen yet: the sorting tweak wins over the interface language
        from calibre.utils.localization import get_lang
        _locale = tweaks['locale_for_sorting'] if tweaks['locale_for_sorting'] else get_lang()
    try:
        _collator = _icu.Collator(_locale)
    except Exception as err:
        print ('Failed to load collator for locale: %r with error %r, using English' % (_locale, err))
        _collator = _icu.Collator('en')
    return _collator
def change_locale(locale=None):
    'Discard all cached collators and record locale as the locale to use when they are next created'
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator
    _collator = None
    _primary_collator = None
    _sort_collator = None
    _numeric_collator = None
    _case_sensitive_collator = None
    _locale = locale
def primary_collator():
    'A collator of primary strength: case differences and accents are ignored'
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    # Clone the base collator so that changing the strength does not affect it
    _primary_collator = collator().clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
def sort_collator():
    'A collator of secondary strength (case is ignored) whose numeric mode follows the numeric_collation tweak'
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = collator().clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    _sort_collator.numeric = tweaks['numeric_collation']
    return _sort_collator
def numeric_collator():
    'A collator of secondary strength that always sorts digit runs as numbers, so something2 sorts before something10'
    global _numeric_collator
    if _numeric_collator is not None:
        return _numeric_collator
    _numeric_collator = collator().clone()
    _numeric_collator.strength = _icu.UCOL_SECONDARY
    _numeric_collator.numeric = True
    return _numeric_collator
def case_sensitive_collator():
    'A collator that puts upper case letters before lower case ones and uses the same numeric mode as the sort collator'
    global _case_sensitive_collator
    if _case_sensitive_collator is not None:
        return _case_sensitive_collator
    _case_sensitive_collator = collator().clone()
    # Mirror the numeric setting chosen for normal sorting
    _case_sensitive_collator.numeric = sort_collator().numeric
    _case_sensitive_collator.upper_first = True
    return _case_sensitive_collator
# Templates that will be used to generate various concrete
# function implementations based on different collators, to allow lazy loading
# of collators, with maximum runtime performance
# Source template for functions taking a single object. Placeholders: {name}
# (function name), {collator} (module-level cached collator variable),
# {collator_func} (function that creates that collator) and {func} (collator
# method to call). The generated code calls {collator}.{func}(obj) directly and
# only on AttributeError calls {collator_func}() and retries. On TypeError,
# bytes input is decoded with sys.getdefaultencoding() and the call repeated;
# bytes that cannot be decoded are returned unchanged.
# NOTE(review): the indentation inside this string is not visible here, so the
# exact nesting of the trailing "return b''" should be confirmed.
_sort_key_template = '''
def {name}(obj):
try:
try:
return {collator}.{func}(obj)
except AttributeError:
return {collator_func}().{func}(obj)
except TypeError:
if isinstance(obj, bytes):
try:
obj = obj.decode(sys.getdefaultencoding())
except ValueError:
return obj
return {collator}.{func}(obj)
return b''
'''
# Source template for functions taking two objects (a, b). Uses the same
# {name}, {collator}, {collator_func} and {func} placeholders and the same
# AttributeError retry as the single-object template. On TypeError each
# argument that is bytes is decoded with sys.getdefaultencoding() and each
# argument that is None becomes u'' before the call is repeated; if decoding
# fails the result of cmp(a, b) is returned instead.
_strcmp_template = '''
def {name}(a, b):
try:
try:
return {collator}.{func}(a, b)
except AttributeError:
return {collator_func}().{func}(a, b)
except TypeError:
if isinstance(a, bytes):
try:
a = a.decode(sys.getdefaultencoding())
except ValueError:
return cmp(a, b)
elif a is None:
a = u''
if isinstance(b, bytes):
try:
b = b.decode(sys.getdefaultencoding())
except ValueError:
return cmp(a, b)
elif b is None:
b = u''
return {collator}.{func}(a, b)
'''
# Source template for the case changing functions. Placeholders: {name}
# (function name) and {which} (name of the case constant looked up on _icu).
# The generated code calls _icu.change_case(x, _icu.{which}, _locale); on
# NotImplementedError it calls collator(), which sets _locale, and retries.
# On TypeError, bytes input is decoded with sys.getdefaultencoding() and the
# call repeated; bytes that cannot be decoded are returned unchanged.
# NOTE(review): the indentation inside this string is not visible here; the
# final bare "raise" presumably re-raises the TypeError for non-bytes input.
_change_case_template = '''
def {name}(x):
try:
try:
return _icu.change_case(x, _icu.{which}, _locale)
except NotImplementedError:
collator() # sets _locale
return _icu.change_case(x, _icu.{which}, _locale)
except TypeError:
if isinstance(x, bytes):
try:
x = x.decode(sys.getdefaultencoding())
except ValueError:
return x
return _icu.change_case(x, _icu.{which}, _locale)
raise
'''
def _make_func(template, name, **kwargs):
    '''
    Create a function from a source code template and install it in this
    module.

    template is formatted with name, func (which defaults to 'sort_key') and
    any extra keyword arguments. The resulting source is executed in the
    module globals and the function object it defines as name is returned.
    '''
    namespace = globals()
    kwargs['name'] = name
    kwargs.setdefault('func', 'sort_key')
    # The call form of exec is accepted by both python 2 and python 3, unlike
    # the "exec code in namespace" statement
    exec(template.format(**kwargs), namespace)
    return namespace[name]
# }}}
################# The string functions ########################################
sort_key = _make_func(_sort_key_template, 'sort_key', collator='_sort_collator', collator_func='sort_collator')
sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
# NOTE(review): this span appears to interleave two revisions of the module
# (diff residue). Statements guarded by `_icu_not_ok` look like the older
# implementation, while the `_make_func(...)` assignments look like their
# replacements. Several names (strcmp, case_sensitive_sort_key, upper, lower,
# title_case, capitalize, find, primary_strcmp, primary_find) are therefore
# bound more than once, and some `def` bodies below are spliced together.
# Confirm against the real file before relying on any individual line here.
numeric_sort_key = _make_func(_sort_key_template, 'numeric_sort_key', collator='_numeric_collator', collator_func='numeric_collator')
strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
primary_sort_key = _make_func(_sort_key_template, 'primary_sort_key', collator='_primary_collator', collator_func='primary_collator')
case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
partial(icu_case_sensitive_sort_key, _collator)
case_sensitive_sort_key = _make_func(_sort_key_template, 'case_sensitive_sort_key',
collator='_case_sensitive_collator', collator_func='case_sensitive_collator')
case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
collation_order = _make_func(_sort_key_template, 'collation_order', collator='_sort_collator', collator_func='sort_collator', func='collation_order')
upper = (lambda s: s.upper()) if _icu_not_ok else \
partial(icu_change_case, True, get_locale())
strcmp = _make_func(_strcmp_template, 'strcmp', collator='_sort_collator', collator_func='sort_collator', func='strcmp')
lower = (lambda s: s.lower()) if _icu_not_ok else \
partial(icu_change_case, False, get_locale())
case_sensitive_strcmp = _make_func(
_strcmp_template, 'case_sensitive_strcmp', collator='_case_sensitive_collator', collator_func='case_sensitive_collator', func='strcmp')
title_case = (lambda s: s.title()) if _icu_not_ok else \
partial(_icu.title, get_locale())
primary_strcmp = _make_func(_strcmp_template, 'primary_strcmp', collator='_primary_collator', collator_func='primary_collator', func='strcmp')
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
(lambda s: icu_capitalize(s))
upper = _make_func(_change_case_template, 'upper', which='UPPER_CASE')
find = (py_find if _icu_not_ok else partial(icu_find, _collator))
lower = _make_func(_change_case_template, 'lower', which='LOWER_CASE')
contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
_collator)))
title_case = _make_func(_change_case_template, 'title_case', which='TITLE_CASE')
def primary_strcmp(a, b):
'strcmp that ignores case and accents on letters'
if _icu_not_ok:
from calibre.utils.filenames import ascii_text
return py_strcmp(ascii_text(a), ascii_text(b))
capitalize = lambda x: upper(x[0]) + lower(x[1:])
find = _make_func(_strcmp_template, 'find', collator='_collator', collator_func='collator', func='find')
primary_find = _make_func(_strcmp_template, 'primary_find', collator='_primary_collator', collator_func='primary_collator', func='find')
startswith = _make_func(_strcmp_template, 'startswith', collator='_collator', collator_func='collator', func='startswith')
primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator='_primary_collator', collator_func='primary_collator', func='startswith')
safe_chr = _icu.chr
def character_name(string):
try:
return _primary_collator.strcmp(a, b)
except AttributeError:
return primary_collator().strcmp(a, b)
return _icu.character_name(unicode(string)) or None
except (TypeError, ValueError, KeyError):
pass
def primary_find(pat, src):
'find that ignores case and accents on letters'
if _icu_not_ok:
from calibre.utils.filenames import ascii_text
return py_find(ascii_text(pat), ascii_text(src))
return primary_icu_find(pat, src)
def primary_icu_find(pat, src):
def character_name_from_code(code):
try:
return icu_find(_primary_collator, pat, src)
except AttributeError:
return icu_find(primary_collator(), pat, src)
return _icu.character_name_from_code(code) or ''
except (TypeError, ValueError, KeyError):
return ''
def primary_sort_key(val):
    """Return a sort key for ``val`` that ignores case and diacritics.

    When ``_icu_not_ok`` is set, the key is the lower-cased ``ascii_text``
    form of ``val``. Otherwise the module-level ``_primary_collator`` builds
    the key, with ``primary_collator()`` used if that raises AttributeError.
    """
    if not _icu_not_ok:
        try:
            key = _primary_collator.sort_key(val)
        except AttributeError:
            # presumably the module-level collator has not been created yet
            key = primary_collator().sort_key(val)
        return key
    from calibre.utils.filenames import ascii_text
    plain = ascii_text(val)
    return plain.lower()
def normalize(text, mode='NFC'):
    """Normalize ``text`` through ICU using the form named by ``mode``.

    ``mode`` is looked up in ``_nmodes`` (a KeyError propagates for unknown
    names) and ``text`` is coerced with ``unicode()`` before being handed to
    ``_icu.normalize``.
    """
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    icu_normalize = _icu.normalize
    return icu_normalize(_nmodes[mode], unicode(text))
def primary_startswith(a, b):
    """Prefix test for ``b`` against ``a`` using the primary collator.

    When ``_icu_not_ok`` is set, both strings are passed through
    ``ascii_text`` and lower-cased before a plain ``str.startswith``.
    Otherwise ``icu_startswith`` is called with ``_primary_collator``, or with
    ``primary_collator()`` if the first attempt raises AttributeError.
    """
    if not _icu_not_ok:
        try:
            matched = icu_startswith(_primary_collator, a, b)
        except AttributeError:
            matched = icu_startswith(primary_collator(), a, b)
        return matched
    from calibre.utils.filenames import ascii_text
    haystack = ascii_text(a).lower()
    return haystack.startswith(ascii_text(b).lower())
def contractions(col=None):
    """Return the contractions defined by a collator, cached per collator.

    col: object providing a ``contractions()`` method. When falsy, the
        module-level ``_collator`` is used, and ``collator()`` is called if
        that is still ``None``.

    Returns a frozenset of the non-empty entries reported by
    ``col.contractions()``. The result is stored in ``_cmap`` keyed by the
    collator object, so later calls with the same collator reuse it.
    """
    col = col or _collator
    if col is None:
        col = collator()
    # Key the cache on the collator actually used. Looking up the ``collator``
    # factory function (as this code previously did) can never hit, because
    # entries are stored under ``col``; the set was rebuilt on every call.
    ans = _cmap.get(col)
    if ans is None:
        ans = _cmap[col] = frozenset(x for x in col.contractions() if x)
    return ans
def collation_order(a):
    """Return the collation-order tuple for ``a``.

    When ``_icu_not_ok`` is set this is ``(ord(a[0]), 1)`` for non-empty input
    and ``(0, 0)`` otherwise. With ICU, ``icu_collation_order`` is called with
    ``_sort_collator``, or with ``sort_collator()`` on AttributeError.
    """
    if _icu_not_ok:
        if not a:
            return (0, 0)
        return (ord(a[0]), 1)
    try:
        order = icu_collation_order(_sort_collator, a)
    except AttributeError:
        order = icu_collation_order(sort_collator(), a)
    return order
################################################################################
def test(): # {{{
from calibre import prints
# Data {{{
german = '''
Sonntag
Montag
Dienstag
Januar
Februar
März
Fuße
Fluße
Flusse
flusse
fluße
flüße
flüsse
'''
german_good = '''
Dienstag
Februar
flusse
Flusse
fluße
Fluße
flüsse
flüße
Fuße
Januar
März
Montag
Sonntag'''
french = '''
dimanche
lundi
mardi
janvier
février
mars
déjà
Meme
deja
même
dejà
bpef
bœg
Boef
Mémé
bœf
boef
bnef
pêche
pèché
pêché
pêche
pêché'''
french_good = '''
bnef
boef
Boef
bœf
bœg
bpef
deja
dejà
déjà
dimanche
février
janvier
lundi
mardi
mars
Meme
Mémé
même
pèché
pêche
pêche
pêché
pêché'''
# }}}
def create(l):
l = l.decode('utf-8').splitlines()
return [x.strip() for x in l if x.strip()]
def test_strcmp(entries):
for x in entries:
for y in entries:
if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
print 'strcmp failed for %r, %r'%(x, y)
german = create(german)
c = _icu.Collator('de')
c.numeric = True
gs = list(sorted(german, key=c.sort_key))
if gs != create(german_good):
print 'German sorting failed'
return
print
french = create(french)
c = _icu.Collator('fr')
c.numeric = True
fs = list(sorted(french, key=c.sort_key))
if fs != create(french_good):
print 'French sorting failed (note that French fails with icu < 4.6)'
return
test_strcmp(german + french)
print '\nTesting case transforms in current locale'
from calibre.utils.titlecase import titlecase
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
print
print '\nTesting primary collation'
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
u'Štepánek':u'ŠtepaneK'}.iteritems():
if primary_strcmp(k, v) != 0:
prints('primary_strcmp() failed with %s != %s'%(k, v))
return
if primary_find(v, u' '+k)[0] != 1:
prints('primary_find() failed with %s not in %s'%(v, k))
return
n = character_name(safe_chr(0x1f431))
if n != u'CAT FACE':
raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
global _primary_collator
orig = _primary_collator
_primary_collator = _icu.Collator('es')
if primary_strcmp(u'peña', u'pena') == 0:
print 'Primary collation in Spanish locale failed'
return
_primary_collator = orig
print '\nTesting contractions'
c = _icu.Collator('cs')
if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
u'S\u030c', u'R\u030c']):
print 'Contractions for the Czech language failed'
return
print '\nTesting startswith'
p = primary_startswith
if (not p('asd', 'asd') or not p('asd', 'A') or
not p('x', '')):
print 'startswith() failed'
return
print '\nTesting collation_order()'
for group in [
('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
('calibre', 'Charon', 'Collins'),
('01', '1'),
('1', '11', '13'),
]:
last = None
for x in group:
val = icu_collation_order(sort_collator(), x)
if val[1] != 1:
prints('collation_order() returned incorrect length for', x)
if last is None:
last = val
else:
if val != last:
prints('collation_order() returned incorrect value for', x)
last = val
# }}}
def test_roundtrip():
    """Check that ``_icu.roundtrip`` returns each sample string unchanged.

    Raises ValueError for the first sample that comes back different.
    """
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for sample in samples:
        echoed = _icu.roundtrip(sample)
        if echoed != sample:
            raise ValueError(u'Roundtripping failed: %r != %r' % (sample, echoed))
def test_normalize_performance():
    """Time ``normalize()`` against ``unicodedata.normalize`` on ``t.txt``.

    Does nothing when ``t.txt`` is absent from the current directory.
    Otherwise the file is decoded as UTF-8, its length is printed, and the
    elapsed time of 100 NFC normalizations with each implementation is
    printed.
    """
    import os
    if not os.path.exists('t.txt'):
        return
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time
    import unicodedata
    count = 100
    st = time.time()
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
# Script entry point.
# NOTE(review): the three direct test calls and the icu_test runner below look
# like two revisions of this guard interleaved (diff residue) -- confirm which
# set belongs to the current file.
if __name__ == '__main__':
test_roundtrip()
test_normalize_performance()
test()
from calibre.utils.icu_test import run
run(verbosity=4)

View File

@ -0,0 +1,148 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import unittest, sys
from contextlib import contextmanager
import calibre.utils.icu as icu
@contextmanager
def make_collation_func(name, locale, numeric=True, template='_sort_key_template', func='strcmp'):
c = icu._icu.Collator(locale)
cname = '%s_test_collator%s' % (name, template)
setattr(icu, cname, c)
c.numeric = numeric
yield icu._make_func(getattr(icu, template), name, collator=cname, collator_func='not_used_xxx', func=func)
delattr(icu, cname)
class TestICU(unittest.TestCase):
    """Unit tests for the helpers exposed by ``calibre.utils.icu``."""

    ae = unittest.TestCase.assertEqual

    def setUp(self):
        # Reset the module to the English locale before every test
        icu.change_locale('en')

    def test_sorting(self):
        ' Test the various sorting APIs '
        german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split()
        german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split()
        french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split()
        french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split()  # noqa

        # Test corner cases
        sort_key = icu.sort_key
        s = '\U0001f431'
        self.ae(sort_key(s), sort_key(s.encode(sys.getdefaultencoding())), 'UTF-8 encoded object not correctly decoded to generate sort key')
        # The undecodable bytestring has to go through sort_key(); comparing
        # the encoded value with itself could never fail
        self.ae(s.encode('utf-16'), sort_key(s.encode('utf-16')), 'Undecodable bytestring not returned as itself')
        self.ae(b'', sort_key(None))
        self.ae(0, icu.strcmp(None, b''))
        self.ae(0, icu.strcmp(s, s.encode(sys.getdefaultencoding())))

        # Test locales
        # The strcmp helpers are checked against the sort-key helpers, so each
        # pair is created in nested contexts to keep both collators alive
        with make_collation_func('dsk', 'de', func='sort_key') as dsk:
            self.ae(german_good, sorted(german, key=dsk))
            with make_collation_func('dcmp', 'de', template='_strcmp_template') as dcmp:
                for x in german:
                    for y in german:
                        self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y))

        with make_collation_func('fsk', 'fr', func='sort_key') as fsk:
            self.ae(french_good, sorted(french, key=fsk))
            with make_collation_func('fcmp', 'fr', template='_strcmp_template') as fcmp:
                for x in french:
                    for y in french:
                        self.ae(cmp(fsk(x), fsk(y)), fcmp(x, y))

        with make_collation_func('ssk', 'es', func='sort_key') as ssk:
            self.assertNotEqual(ssk('peña'), ssk('pena'))
            with make_collation_func('scmp', 'es', template='_strcmp_template') as scmp:
                self.assertNotEqual(0, scmp('pena', 'peña'))

        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae(0, icu.primary_strcmp(k, v))

        # Test different types of collation
        # Primary keys must ignore case and accents; an empty string could
        # never share a key with 'aa', so compare a case/accent variant
        self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa'))
        self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11'))
        self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a'))
        self.ae(0, icu.strcmp('a', 'A'))
        self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A'))
        self.ae(0, icu.primary_strcmp('ä', 'A'))

    def test_change_case(self):
        ' Test the various ways of changing the case '
        from calibre.utils.titlecase import titlecase
        # Test corner cases
        self.ae('A', icu.upper(b'a'))
        for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'):
            self.ae(icu.upper(x), x.upper())
            self.ae(icu.lower(x), x.lower())
            # ICU's title case algorithm is different from ours, when there are
            # capitals inside words
            self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine'))
            self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower())

    def test_find(self):
        ' Test searching for substrings '
        self.ae((1, 1), icu.find(b'a', b'1ab'))
        self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
        self.assertTrue(icu.startswith(b'abc', b'ab'))
        self.assertTrue(icu.startswith('abc', 'abc'))
        self.assertFalse(icu.startswith('xyz', 'a'))
        self.assertTrue(icu.startswith('xxx', ''))
        self.assertTrue(icu.primary_startswith('pena', 'peña'))

    def test_collation_order(self):
        'Testing collation ordering'
        for group in [
            ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
            ('01', '1'),
            ('1', '11', '13'),
        ]:
            last = None
            for x in group:
                order, length = icu.numeric_collator().collation_order(x)
                # All members of a group are expected to share one order value
                if last is not None:
                    self.ae(last, order)
                last = order

    def test_roundtrip(self):
        # Strings with NULs, BMP and non-BMP characters must survive the
        # conversion to and from the ICU string representation unchanged
        for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
            self.ae(r, icu._icu.roundtrip(r))

    def test_character_name(self):
        self.ae(icu.character_name('\U0001f431'), 'CAT FACE')

    def test_contractions(self):
        c = icu._icu.Collator('cs')
        self.ae(icu.contractions(c), frozenset({u'Z\u030c', u'z\u030c', u'Ch',
            u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
            u'S\u030c', u'R\u030c'}))
class TestRunner(unittest.main):
    """``unittest.main`` variant that always runs the ``TestICU`` case."""

    def createTests(self):
        # Skip command-line test discovery and load only TestICU
        loader = unittest.TestLoader()
        suite = loader.loadTestsFromTestCase(TestICU)
        self.test = suite
def run(verbosity=4):
    """Run the ICU tests at the given verbosity without exiting the interpreter."""
    options = dict(verbosity=verbosity, exit=False)
    TestRunner(**options)
def test_build():
    """Run the ICU tests quietly and raise ``SystemExit(1)`` if any fail.

    Only ``sys.argv[0]`` is forwarded, so the caller's command-line arguments
    are not seen by unittest. The run stops at the first failure and output
    is buffered.
    """
    program = TestRunner(
        verbosity=0, buffer=True, catchbreak=True, failfast=True,
        argv=sys.argv[:1], exit=False)
    outcome = program.result
    if outcome.wasSuccessful():
        return
    raise SystemExit(1)
# When executed as a script, run the ICU test suite with verbose output.
if __name__ == '__main__':
run(verbosity=4)