mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Performance improvements and code cleanup for the ICU module
This commit is contained in:
parent
b8e414f18b
commit
f078cd7168
@ -14,13 +14,13 @@ from PyQt4.Qt import (QLineEdit, QAbstractListModel, Qt, pyqtSignal, QObject,
|
||||
QApplication, QListView, QPoint, QModelIndex, QFont, QFontInfo)
|
||||
|
||||
from calibre.constants import isosx, get_osx_version
|
||||
from calibre.utils.icu import sort_key, primary_startswith, primary_icu_find
|
||||
from calibre.utils.icu import sort_key, primary_startswith, primary_find
|
||||
from calibre.gui2 import NONE
|
||||
from calibre.gui2.widgets import EnComboBox, LineEditECM
|
||||
from calibre.utils.config import tweaks
|
||||
|
||||
def containsq(x, prefix):
|
||||
return primary_icu_find(prefix, x)[0] != -1
|
||||
return primary_find(prefix, x)[0] != -1
|
||||
|
||||
class CompleteModel(QAbstractListModel): # {{{
|
||||
|
||||
|
@ -113,10 +113,9 @@ def test_ssl():
|
||||
print ('SSL OK!')
|
||||
|
||||
def test_icu():
|
||||
from calibre.utils.icu import _icu_not_ok, test_roundtrip
|
||||
if _icu_not_ok:
|
||||
raise RuntimeError('ICU module not loaded/valid')
|
||||
test_roundtrip()
|
||||
print ('Testing ICU')
|
||||
from calibre.utils.icu_test import test_build
|
||||
test_build()
|
||||
print ('ICU OK!')
|
||||
|
||||
def test_wpd():
|
||||
|
@ -1,5 +1,9 @@
|
||||
#include "icu_calibre_utils.h"
|
||||
|
||||
#define UPPER_CASE 0
|
||||
#define LOWER_CASE 1
|
||||
#define TITLE_CASE 2
|
||||
|
||||
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
|
||||
wchar_t *buf = NULL;
|
||||
PyObject *ans = NULL;
|
||||
@ -66,20 +70,16 @@ icu_Collator_display_name(icu_Collator *self, void *closure) {
|
||||
const char *loc = NULL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UChar dname[400];
|
||||
char buf[100];
|
||||
int32_t sz = 0;
|
||||
|
||||
loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status);
|
||||
if (loc == NULL || U_FAILURE(status)) {
|
||||
if (loc == NULL) {
|
||||
PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL;
|
||||
}
|
||||
ucol_getDisplayName(loc, "en", dname, 100, &status);
|
||||
if (U_FAILURE(status)) return PyErr_NoMemory();
|
||||
sz = ucol_getDisplayName(loc, "en", dname, sizeof(dname), &status);
|
||||
if (U_FAILURE(status)) {PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }
|
||||
|
||||
u_strToUTF8(buf, 100, NULL, dname, -1, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
PyErr_SetString(PyExc_Exception, "Failed to convert dname to UTF-8"); return NULL;
|
||||
}
|
||||
return Py_BuildValue("s", buf);
|
||||
return icu_to_python(dname, sz);
|
||||
}
|
||||
|
||||
// }}}
|
||||
@ -140,47 +140,29 @@ icu_Collator_capsule(icu_Collator *self, void *closure) {
|
||||
// Collator.sort_key {{{
|
||||
static PyObject *
|
||||
icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
char *input;
|
||||
int32_t sz;
|
||||
UChar *buf;
|
||||
uint8_t *buf2;
|
||||
PyObject *ans;
|
||||
int32_t key_size;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t sz = 0, key_size = 0, bsz = 0;
|
||||
UChar *buf = NULL;
|
||||
uint8_t *buf2 = NULL;
|
||||
PyObject *ans = NULL, *input = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "es", "UTF-8", &input)) return NULL;
|
||||
if (!PyArg_ParseTuple(args, "O", &input)) return NULL;
|
||||
buf = python_to_icu(input, &sz, 1);
|
||||
if (buf == NULL) return NULL;
|
||||
|
||||
sz = (int32_t)strlen(input);
|
||||
bsz = 7 * sz + 1;
|
||||
buf2 = (uint8_t*)calloc(bsz, sizeof(uint8_t));
|
||||
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
|
||||
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, bsz);
|
||||
if (key_size > bsz) {
|
||||
buf2 = realloc(buf2, (key_size + 1) * sizeof(uint8_t));
|
||||
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
|
||||
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, key_size + 1);
|
||||
}
|
||||
ans = PyBytes_FromStringAndSize((char*)buf2, key_size);
|
||||
|
||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
||||
|
||||
if (buf == NULL) return PyErr_NoMemory();
|
||||
|
||||
u_strFromUTF8(buf, sz*4 + 1, &key_size, input, sz, &status);
|
||||
PyMem_Free(input);
|
||||
|
||||
if (U_SUCCESS(status)) {
|
||||
buf2 = (uint8_t*)calloc(7*sz+1, sizeof(uint8_t));
|
||||
if (buf2 == NULL) return PyErr_NoMemory();
|
||||
|
||||
key_size = ucol_getSortKey(self->collator, buf, -1, buf2, 7*sz+1);
|
||||
|
||||
if (key_size == 0) {
|
||||
ans = PyBytes_FromString("");
|
||||
} else {
|
||||
if (key_size >= 7*sz+1) {
|
||||
free(buf2);
|
||||
buf2 = (uint8_t*)calloc(key_size+1, sizeof(uint8_t));
|
||||
if (buf2 == NULL) return PyErr_NoMemory();
|
||||
ucol_getSortKey(self->collator, buf, -1, buf2, key_size+1);
|
||||
}
|
||||
ans = PyBytes_FromString((char *)buf2);
|
||||
}
|
||||
free(buf2);
|
||||
} else ans = PyBytes_FromString("");
|
||||
|
||||
free(buf);
|
||||
if (ans == NULL) return PyErr_NoMemory();
|
||||
end:
|
||||
if (buf != NULL) free(buf);
|
||||
if (buf2 != NULL) free(buf2);
|
||||
|
||||
return ans;
|
||||
} // }}}
|
||||
@ -188,86 +170,64 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
// Collator.strcmp {{{
|
||||
static PyObject *
|
||||
icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
char *a_, *b_;
|
||||
int32_t asz, bsz;
|
||||
UChar *a, *b;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
PyObject *a_ = NULL, *b_ = NULL;
|
||||
int32_t asz = 0, bsz = 0;
|
||||
UChar *a = NULL, *b = NULL;
|
||||
UCollationResult res = UCOL_EQUAL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL;
|
||||
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||
|
||||
asz = (int32_t)strlen(a_); bsz = (int32_t)strlen(b_);
|
||||
a = python_to_icu(a_, &asz, 1);
|
||||
if (a == NULL) goto end;
|
||||
b = python_to_icu(b_, &bsz, 1);
|
||||
if (b == NULL) goto end;
|
||||
res = ucol_strcoll(self->collator, a, asz, b, bsz);
|
||||
end:
|
||||
if (a != NULL) free(a); if (b != NULL) free(b);
|
||||
|
||||
a = (UChar*)calloc(asz*4 + 1, sizeof(UChar));
|
||||
b = (UChar*)calloc(bsz*4 + 1, sizeof(UChar));
|
||||
|
||||
|
||||
if (a == NULL || b == NULL) return PyErr_NoMemory();
|
||||
|
||||
u_strFromUTF8(a, asz*4 + 1, NULL, a_, asz, &status);
|
||||
u_strFromUTF8(b, bsz*4 + 1, NULL, b_, bsz, &status);
|
||||
PyMem_Free(a_); PyMem_Free(b_);
|
||||
|
||||
if (U_SUCCESS(status))
|
||||
res = ucol_strcoll(self->collator, a, -1, b, -1);
|
||||
|
||||
free(a); free(b);
|
||||
|
||||
return Py_BuildValue("i", res);
|
||||
return (PyErr_Occurred()) ? NULL : Py_BuildValue("i", res);
|
||||
} // }}}
|
||||
|
||||
// Collator.find {{{
|
||||
static PyObject *
|
||||
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *a_, *b_;
|
||||
int32_t asz, bsz;
|
||||
UChar *a, *b;
|
||||
wchar_t *aw, *bw;
|
||||
PyObject *a_ = NULL, *b_ = NULL;
|
||||
UChar *a = NULL, *b = NULL;
|
||||
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UStringSearch *search = NULL;
|
||||
int32_t pos = -1, length = -1;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
|
||||
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
|
||||
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||
|
||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
||||
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
|
||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
||||
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
|
||||
|
||||
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
|
||||
|
||||
PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
||||
PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
|
||||
u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status);
|
||||
u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status);
|
||||
a = python_to_icu(a_, &asz, 1);
|
||||
if (a == NULL) goto end;
|
||||
b = python_to_icu(b_, &bsz, 1);
|
||||
if (b == NULL) goto end;
|
||||
|
||||
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
search = usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
pos = usearch_first(search, &status);
|
||||
if (pos != USEARCH_DONE)
|
||||
length = usearch_getMatchedLength(search);
|
||||
else
|
||||
pos = -1;
|
||||
}
|
||||
if (search != NULL) usearch_close(search);
|
||||
pos = usearch_first(search, &status);
|
||||
if (pos != USEARCH_DONE)
|
||||
length = usearch_getMatchedLength(search);
|
||||
else
|
||||
pos = -1;
|
||||
}
|
||||
end:
|
||||
if (search != NULL) usearch_close(search);
|
||||
if (a != NULL) free(a);
|
||||
if (b != NULL) free(b);
|
||||
|
||||
free(a); free(b); free(aw); free(bw);
|
||||
|
||||
return Py_BuildValue("ii", pos, length);
|
||||
return (PyErr_Occurred()) ? NULL : Py_BuildValue("ii", pos, length);
|
||||
} // }}}
|
||||
|
||||
// Collator.contractions {{{
|
||||
static PyObject *
|
||||
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UChar *str;
|
||||
UChar *str = NULL;
|
||||
UChar32 start=0, end=0;
|
||||
int32_t count = 0, len = 0, dlen = 0, i;
|
||||
int32_t count = 0, len = 0, i;
|
||||
PyObject *ans = Py_None, *pbuf;
|
||||
wchar_t *buf;
|
||||
|
||||
if (self->contractions == NULL) {
|
||||
self->contractions = uset_open(1, 0);
|
||||
@ -275,107 +235,112 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs)
|
||||
self->contractions = ucol_getTailoredSet(self->collator, &status);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
count = uset_getItemCount(self->contractions);
|
||||
|
||||
str = (UChar*)calloc(100, sizeof(UChar));
|
||||
buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t));
|
||||
if (str == NULL || buf == NULL) return PyErr_NoMemory();
|
||||
|
||||
count = uset_getItemCount(self->contractions);
|
||||
if (str == NULL) { PyErr_NoMemory(); goto end; }
|
||||
ans = PyTuple_New(count);
|
||||
if (ans != NULL) {
|
||||
for (i = 0; i < count; i++) {
|
||||
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
|
||||
if (len >= 2) {
|
||||
// We have a string
|
||||
status = U_ZERO_ERROR;
|
||||
u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status);
|
||||
pbuf = PyUnicode_FromWideChar(buf, dlen);
|
||||
if (pbuf == NULL) return PyErr_NoMemory();
|
||||
PyTuple_SetItem(ans, i, pbuf);
|
||||
} else {
|
||||
// Ranges dont make sense for contractions, ignore them
|
||||
PyTuple_SetItem(ans, i, Py_None);
|
||||
}
|
||||
if (ans == NULL) { goto end; }
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
|
||||
if (len >= 2) {
|
||||
// We have a string
|
||||
status = U_ZERO_ERROR;
|
||||
pbuf = icu_to_python(str, len);
|
||||
if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
|
||||
PyTuple_SetItem(ans, i, pbuf);
|
||||
} else {
|
||||
// Ranges dont make sense for contractions, ignore them
|
||||
PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None);
|
||||
}
|
||||
}
|
||||
free(str); free(buf);
|
||||
end:
|
||||
if (str != NULL) free(str);
|
||||
|
||||
return Py_BuildValue("O", ans);
|
||||
return ans;
|
||||
} // }}}
|
||||
|
||||
// Collator.startswith {{{
|
||||
static PyObject *
|
||||
icu_Collator_startswith(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *a_, *b_;
|
||||
int32_t asz, bsz;
|
||||
int32_t actual_a, actual_b;
|
||||
UChar *a, *b;
|
||||
wchar_t *aw, *bw;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int ans = 0;
|
||||
PyObject *a_ = NULL, *b_ = NULL;
|
||||
int32_t asz = 0, bsz = 0;
|
||||
UChar *a = NULL, *b = NULL;
|
||||
uint8_t ans = 0;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
|
||||
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
|
||||
if (asz < bsz) Py_RETURN_FALSE;
|
||||
if (bsz == 0) Py_RETURN_TRUE;
|
||||
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||
|
||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
||||
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
|
||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
||||
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
|
||||
a = python_to_icu(a_, &asz, 1);
|
||||
if (a == NULL) goto end;
|
||||
b = python_to_icu(b_, &bsz, 1);
|
||||
if (b == NULL) goto end;
|
||||
|
||||
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
|
||||
if (asz < bsz) goto end;
|
||||
if (bsz == 0) { ans = 1; goto end; }
|
||||
|
||||
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
||||
actual_b = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
|
||||
if (actual_a > -1 && actual_b > -1) {
|
||||
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
|
||||
u_strFromWCS(b, bsz*4 + 1, &actual_b, bw, -1, &status);
|
||||
ans = ucol_equal(self->collator, a, bsz, b, bsz);
|
||||
|
||||
if (U_SUCCESS(status) && ucol_equal(self->collator, a, actual_b, b, actual_b))
|
||||
ans = 1;
|
||||
}
|
||||
end:
|
||||
if (a != NULL) free(a);
|
||||
if (b != NULL) free(b);
|
||||
|
||||
free(a); free(b); free(aw); free(bw);
|
||||
if (ans) Py_RETURN_TRUE;
|
||||
if (PyErr_Occurred()) return NULL;
|
||||
if (ans) { Py_RETURN_TRUE; }
|
||||
Py_RETURN_FALSE;
|
||||
} // }}}
|
||||
|
||||
// Collator.startswith {{{
|
||||
// Collator.collation_order {{{
|
||||
static PyObject *
|
||||
icu_Collator_collation_order(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||
PyObject *a_;
|
||||
int32_t asz;
|
||||
int32_t actual_a;
|
||||
UChar *a;
|
||||
wchar_t *aw;
|
||||
PyObject *a_ = NULL;
|
||||
int32_t asz = 0;
|
||||
UChar *a = NULL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollationElements *iter = NULL;
|
||||
int order = 0, len = -1;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "U", &a_)) return NULL;
|
||||
asz = (int32_t)PyUnicode_GetSize(a_);
|
||||
if (!PyArg_ParseTuple(args, "O", &a_)) return NULL;
|
||||
|
||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
||||
a = python_to_icu(a_, &asz, 1);
|
||||
if (a == NULL) goto end;
|
||||
|
||||
if (a == NULL || aw == NULL ) return PyErr_NoMemory();
|
||||
|
||||
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
||||
if (actual_a > -1) {
|
||||
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
|
||||
iter = ucol_openElements(self->collator, a, actual_a, &status);
|
||||
if (iter != NULL && U_SUCCESS(status)) {
|
||||
order = ucol_next(iter, &status);
|
||||
len = ucol_getOffset(iter);
|
||||
ucol_closeElements(iter); iter = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
free(a); free(aw);
|
||||
iter = ucol_openElements(self->collator, a, asz, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
|
||||
order = ucol_next(iter, &status);
|
||||
len = ucol_getOffset(iter);
|
||||
end:
|
||||
if (iter != NULL) ucol_closeElements(iter); iter = NULL;
|
||||
if (a != NULL) free(a);
|
||||
if (PyErr_Occurred()) return NULL;
|
||||
return Py_BuildValue("ii", order, len);
|
||||
} // }}}
|
||||
|
||||
// Collator.upper_first {{{
|
||||
static PyObject *
|
||||
icu_Collator_get_upper_first(icu_Collator *self, void *closure) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UColAttributeValue val;
|
||||
|
||||
val = ucol_getAttribute(self->collator, UCOL_CASE_FIRST, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }
|
||||
|
||||
if (val == UCOL_OFF) { Py_RETURN_NONE; }
|
||||
if (val) {
|
||||
Py_RETURN_TRUE;
|
||||
}
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
|
||||
static int
|
||||
icu_Collator_set_upper_first(icu_Collator *self, PyObject *val, void *closure) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ucol_setAttribute(self->collator, UCOL_CASE_FIRST, (val == Py_None) ? UCOL_OFF : ((PyObject_IsTrue(val)) ? UCOL_UPPER_FIRST : UCOL_LOWER_FIRST), &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; }
|
||||
return 0;
|
||||
}
|
||||
// }}}
|
||||
|
||||
static PyObject*
|
||||
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs);
|
||||
|
||||
@ -432,6 +397,11 @@ static PyGetSetDef icu_Collator_getsetters[] = {
|
||||
(char *)"The strength of this collator.",
|
||||
NULL},
|
||||
|
||||
{(char *)"upper_first",
|
||||
(getter)icu_Collator_get_upper_first, (setter)icu_Collator_set_upper_first,
|
||||
(char *)"Whether this collator should always put upper case letters before lower case. Values are: None - means use the tertiary strength of the letters. True - Always sort upper case before lower case. False - Always sort lower case before upper case.",
|
||||
NULL},
|
||||
|
||||
{(char *)"numeric",
|
||||
(getter)icu_Collator_get_numeric, (setter)icu_Collator_set_numeric,
|
||||
(char *)"If True the collator sorts contiguous digits as numbers rather than strings, so 2 will sort before 10.",
|
||||
@ -513,139 +483,45 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
|
||||
// }}}
|
||||
|
||||
|
||||
// upper {{{
|
||||
static PyObject *
|
||||
icu_upper(PyObject *self, PyObject *args) {
|
||||
char *input, *ans, *buf3 = NULL;
|
||||
const char *loc;
|
||||
int32_t sz;
|
||||
UChar *buf, *buf2;
|
||||
PyObject *ret;
|
||||
// change_case {{{
|
||||
|
||||
static PyObject* icu_change_case(PyObject *self, PyObject *args) {
|
||||
char *locale = NULL;
|
||||
PyObject *input = NULL, *result = NULL;
|
||||
int which = UPPER_CASE;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UChar *input_buf = NULL, *output_buf = NULL;
|
||||
int32_t sz = 0;
|
||||
|
||||
|
||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
||||
|
||||
sz = (int32_t)strlen(input);
|
||||
|
||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
||||
|
||||
|
||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
||||
|
||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
||||
u_strToUpper(buf2, sz*8, buf, -1, loc, &status);
|
||||
|
||||
ans = input;
|
||||
sz = u_strlen(buf2);
|
||||
free(buf);
|
||||
|
||||
if (U_SUCCESS(status) && sz > 0) {
|
||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
||||
if (buf3 == NULL) return PyErr_NoMemory();
|
||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
||||
if (U_SUCCESS(status)) ans = buf3;
|
||||
if (!PyArg_ParseTuple(args, "Oiz", &input, &which, &locale)) return NULL;
|
||||
if (locale == NULL) {
|
||||
PyErr_SetString(PyExc_NotImplementedError, "You must specify a locale"); // We deliberately use NotImplementedError so that this error can be unambiguously identified
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
||||
if (ret == NULL) return PyErr_NoMemory();
|
||||
input_buf = python_to_icu(input, &sz, 1);
|
||||
if (input_buf == NULL) goto end;
|
||||
// Always request at least one element: calloc(0, ...) is allowed to return NULL, which
// the check that follows would misreport as an out-of-memory error for an empty input.
output_buf = (UChar*) calloc(3 * sz + 1, sizeof(UChar));
|
||||
if (output_buf == NULL) { PyErr_NoMemory(); goto end; }
|
||||
|
||||
free(buf2);
|
||||
if (buf3 != NULL) free(buf3);
|
||||
PyMem_Free(input);
|
||||
|
||||
return ret;
|
||||
} // }}}
|
||||
|
||||
// lower {{{
|
||||
static PyObject *
|
||||
icu_lower(PyObject *self, PyObject *args) {
|
||||
char *input, *ans, *buf3 = NULL;
|
||||
const char *loc;
|
||||
int32_t sz;
|
||||
UChar *buf, *buf2;
|
||||
PyObject *ret;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
||||
|
||||
sz = (int32_t)strlen(input);
|
||||
|
||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
||||
|
||||
|
||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
||||
|
||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
||||
u_strToLower(buf2, sz*8, buf, -1, loc, &status);
|
||||
|
||||
ans = input;
|
||||
sz = u_strlen(buf2);
|
||||
free(buf);
|
||||
|
||||
if (U_SUCCESS(status) && sz > 0) {
|
||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
||||
if (buf3 == NULL) return PyErr_NoMemory();
|
||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
||||
if (U_SUCCESS(status)) ans = buf3;
|
||||
switch (which) {
|
||||
case TITLE_CASE:
|
||||
sz = u_strToTitle(output_buf, 3 * sz, input_buf, sz, NULL, locale, &status);
|
||||
break;
|
||||
case UPPER_CASE:
|
||||
sz = u_strToUpper(output_buf, 3 * sz, input_buf, sz, locale, &status);
|
||||
break;
|
||||
default:
|
||||
sz = u_strToLower(output_buf, 3 * sz, input_buf, sz, locale, &status);
|
||||
}
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
|
||||
result = icu_to_python(output_buf, sz);
|
||||
|
||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
||||
if (ret == NULL) return PyErr_NoMemory();
|
||||
end:
|
||||
if (input_buf != NULL) free(input_buf);
|
||||
if (output_buf != NULL) free(output_buf);
|
||||
return result;
|
||||
|
||||
free(buf2);
|
||||
if (buf3 != NULL) free(buf3);
|
||||
PyMem_Free(input);
|
||||
|
||||
return ret;
|
||||
} // }}}
|
||||
|
||||
// title {{{
|
||||
static PyObject *
|
||||
icu_title(PyObject *self, PyObject *args) {
|
||||
char *input, *ans, *buf3 = NULL;
|
||||
const char *loc;
|
||||
int32_t sz;
|
||||
UChar *buf, *buf2;
|
||||
PyObject *ret;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
||||
|
||||
sz = (int32_t)strlen(input);
|
||||
|
||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
||||
|
||||
|
||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
||||
|
||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
||||
u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status);
|
||||
|
||||
ans = input;
|
||||
sz = u_strlen(buf2);
|
||||
free(buf);
|
||||
|
||||
if (U_SUCCESS(status) && sz > 0) {
|
||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
||||
if (buf3 == NULL) return PyErr_NoMemory();
|
||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
||||
if (U_SUCCESS(status)) ans = buf3;
|
||||
}
|
||||
|
||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
||||
if (ret == NULL) return PyErr_NoMemory();
|
||||
|
||||
free(buf2);
|
||||
if (buf3 != NULL) free(buf3);
|
||||
PyMem_Free(input);
|
||||
|
||||
return ret;
|
||||
} // }}}
|
||||
|
||||
// set_default_encoding {{{
|
||||
@ -662,7 +538,7 @@ icu_set_default_encoding(PyObject *self, PyObject *args) {
|
||||
}
|
||||
// }}}
|
||||
|
||||
// set_default_encoding {{{
|
||||
// set_filesystem_encoding {{{
|
||||
static PyObject *
|
||||
icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
|
||||
char *encoding;
|
||||
@ -674,7 +550,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
|
||||
}
|
||||
// }}}
|
||||
|
||||
// set_default_encoding {{{
|
||||
// get_available_transliterators {{{
|
||||
static PyObject *
|
||||
icu_get_available_transliterators(PyObject *self, PyObject *args) {
|
||||
PyObject *ans, *l;
|
||||
@ -835,16 +711,8 @@ icu_roundtrip(PyObject *self, PyObject *args) {
|
||||
|
||||
// Module initialization {{{
|
||||
static PyMethodDef icu_methods[] = {
|
||||
{"upper", icu_upper, METH_VARARGS,
|
||||
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
|
||||
},
|
||||
|
||||
{"lower", icu_lower, METH_VARARGS,
|
||||
"lower(locale, unicode object) -> lower cased unicode object using locale rules."
|
||||
},
|
||||
|
||||
{"title", icu_title, METH_VARARGS,
|
||||
"title(locale, unicode object) -> Title cased unicode object using locale rules."
|
||||
{"change_case", icu_change_case, METH_VARARGS,
|
||||
"change_case(unicode object, which, locale) -> change case to one of UPPER_CASE, LOWER_CASE, TITLE_CASE"
|
||||
},
|
||||
|
||||
{"set_default_encoding", icu_set_default_encoding, METH_VARARGS,
|
||||
@ -946,5 +814,9 @@ initicu(void)
|
||||
ADDUCONST(UNORM_NFKC);
|
||||
ADDUCONST(UNORM_FCD);
|
||||
|
||||
ADDUCONST(UPPER_CASE);
|
||||
ADDUCONST(LOWER_CASE);
|
||||
ADDUCONST(TITLE_CASE);
|
||||
|
||||
}
|
||||
// }}}
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
@ -7,232 +9,20 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
# Setup code {{{
|
||||
import sys
|
||||
from functools import partial
|
||||
|
||||
from calibre.constants import plugins
|
||||
from calibre.utils.config_base import tweaks
|
||||
|
||||
_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None
|
||||
_locale = None
|
||||
_locale = _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None
|
||||
|
||||
_none = u''
|
||||
_none2 = b''
|
||||
|
||||
def get_locale():
|
||||
global _locale
|
||||
if _locale is None:
|
||||
from calibre.utils.localization import get_lang
|
||||
if tweaks['locale_for_sorting']:
|
||||
_locale = tweaks['locale_for_sorting']
|
||||
else:
|
||||
_locale = get_lang()
|
||||
return _locale
|
||||
|
||||
def load_icu():
|
||||
global _icu
|
||||
if _icu is None:
|
||||
_icu = plugins['icu'][0]
|
||||
if _icu is None:
|
||||
print 'Loading ICU failed with: ', plugins['icu'][1]
|
||||
else:
|
||||
if not getattr(_icu, 'ok', False):
|
||||
print 'icu not ok'
|
||||
_icu = None
|
||||
return _icu
|
||||
|
||||
def load_collator():
|
||||
'The default collator for most locales takes both case and accented letters into account'
|
||||
global _collator
|
||||
if _collator is None:
|
||||
icu = load_icu()
|
||||
if icu is not None:
|
||||
_collator = icu.Collator(get_locale())
|
||||
return _collator
|
||||
|
||||
def primary_collator():
|
||||
'Ignores case differences and accented characters'
|
||||
global _primary_collator
|
||||
if _primary_collator is None:
|
||||
_primary_collator = _collator.clone()
|
||||
_primary_collator.strength = _icu.UCOL_PRIMARY
|
||||
return _primary_collator
|
||||
|
||||
def sort_collator():
|
||||
'Ignores case differences and recognizes numbers in strings'
|
||||
global _sort_collator
|
||||
if _sort_collator is None:
|
||||
_sort_collator = _collator.clone()
|
||||
_sort_collator.strength = _icu.UCOL_SECONDARY
|
||||
if tweaks['numeric_collation']:
|
||||
try:
|
||||
_sort_collator.numeric = True
|
||||
except AttributeError:
|
||||
pass
|
||||
return _sort_collator
|
||||
|
||||
def py_sort_key(obj):
|
||||
if not obj:
|
||||
return _none
|
||||
return obj.lower()
|
||||
|
||||
def icu_sort_key(collator, obj):
|
||||
if not obj:
|
||||
return _none2
|
||||
try:
|
||||
try:
|
||||
return _sort_collator.sort_key(obj)
|
||||
except AttributeError:
|
||||
return sort_collator().sort_key(obj)
|
||||
except TypeError:
|
||||
if isinstance(obj, unicode):
|
||||
obj = obj.replace(u'\0', u'')
|
||||
else:
|
||||
obj = obj.replace(b'\0', b'')
|
||||
return _sort_collator.sort_key(obj)
|
||||
|
||||
def numeric_collator():
|
||||
global _numeric_collator
|
||||
_numeric_collator = _collator.clone()
|
||||
_numeric_collator.strength = _icu.UCOL_SECONDARY
|
||||
_numeric_collator.numeric = True
|
||||
return _numeric_collator
|
||||
|
||||
def numeric_sort_key(obj):
|
||||
'Uses natural sorting for numbers inside strings so something2 will sort before something10'
|
||||
if not obj:
|
||||
return _none2
|
||||
try:
|
||||
try:
|
||||
return _numeric_collator.sort_key(obj)
|
||||
except AttributeError:
|
||||
return numeric_collator().sort_key(obj)
|
||||
except TypeError:
|
||||
if isinstance(obj, unicode):
|
||||
obj = obj.replace(u'\0', u'')
|
||||
else:
|
||||
obj = obj.replace(b'\0', b'')
|
||||
return _numeric_collator.sort_key(obj)
|
||||
|
||||
def icu_change_case(upper, locale, obj):
|
||||
func = _icu.upper if upper else _icu.lower
|
||||
try:
|
||||
return func(locale, obj)
|
||||
except TypeError:
|
||||
if isinstance(obj, unicode):
|
||||
obj = obj.replace(u'\0', u'')
|
||||
else:
|
||||
obj = obj.replace(b'\0', b'')
|
||||
return func(locale, obj)
|
||||
|
||||
def py_find(pattern, source):
|
||||
pos = source.find(pattern)
|
||||
if pos > -1:
|
||||
return pos, len(pattern)
|
||||
return -1, -1
|
||||
|
||||
def character_name(string):
|
||||
try:
|
||||
try:
|
||||
return _icu.character_name(unicode(string)) or None
|
||||
except AttributeError:
|
||||
import unicodedata
|
||||
return unicodedata.name(unicode(string)[0], None)
|
||||
except (TypeError, ValueError, KeyError):
|
||||
pass
|
||||
|
||||
def character_name_from_code(code):
|
||||
try:
|
||||
try:
|
||||
return _icu.character_name_from_code(code) or ''
|
||||
except AttributeError:
|
||||
import unicodedata
|
||||
return unicodedata.name(py_safe_chr(code), '')
|
||||
except (TypeError, ValueError, KeyError):
|
||||
return ''
|
||||
|
||||
if sys.maxunicode >= 0x10ffff:
|
||||
try:
|
||||
py_safe_chr = unichr
|
||||
except NameError:
|
||||
py_safe_chr = chr
|
||||
else:
|
||||
def py_safe_chr(i):
|
||||
# Narrow builds of python cannot represent code point > 0xffff as a
|
||||
# single character, so we need our own implementation of unichr
|
||||
# that returns them as a surrogate pair
|
||||
return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
|
||||
|
||||
def safe_chr(code):
|
||||
try:
|
||||
return _icu.chr(code)
|
||||
except AttributeError:
|
||||
return py_safe_chr(code)
|
||||
|
||||
def normalize(text, mode='NFC'):
|
||||
# This is very slightly slower than using unicodedata.normalize, so stick with
|
||||
# that unless you have very good reasons not too. Also, it's speed
|
||||
# decreases on wide python builds, where conversion to/from ICU's string
|
||||
# representation is slower.
|
||||
try:
|
||||
return _icu.normalize(_nmodes[mode], unicode(text))
|
||||
except (AttributeError, KeyError):
|
||||
import unicodedata
|
||||
return unicodedata.normalize(mode, unicode(text))
|
||||
|
||||
def icu_find(collator, pattern, source):
|
||||
try:
|
||||
return collator.find(pattern, source)
|
||||
except TypeError:
|
||||
return collator.find(unicode(pattern), unicode(source))
|
||||
|
||||
def icu_startswith(collator, a, b):
|
||||
try:
|
||||
return collator.startswith(a, b)
|
||||
except TypeError:
|
||||
return collator.startswith(unicode(a), unicode(b))
|
||||
|
||||
def py_case_sensitive_sort_key(obj):
|
||||
if not obj:
|
||||
return _none
|
||||
return obj
|
||||
|
||||
def icu_case_sensitive_sort_key(collator, obj):
|
||||
if not obj:
|
||||
return _none2
|
||||
return collator.sort_key(obj)
|
||||
|
||||
def icu_strcmp(collator, a, b):
|
||||
return collator.strcmp(lower(a), lower(b))
|
||||
|
||||
def py_strcmp(a, b):
|
||||
return cmp(a.lower(), b.lower())
|
||||
|
||||
def icu_case_sensitive_strcmp(collator, a, b):
|
||||
return collator.strcmp(a, b)
|
||||
|
||||
def icu_capitalize(s):
|
||||
s = lower(s)
|
||||
return s.replace(s[0], upper(s[0]), 1) if s else s
|
||||
|
||||
_cmap = {}
|
||||
def icu_contractions(collator):
|
||||
global _cmap
|
||||
ans = _cmap.get(collator, None)
|
||||
if ans is None:
|
||||
ans = collator.contractions()
|
||||
ans = frozenset(filter(None, ans)) if ans else {}
|
||||
_cmap[collator] = ans
|
||||
return ans
|
||||
|
||||
def icu_collation_order(collator, a):
    ''' Return collator.collation_order(a). If that raises TypeError the
    call is repeated with a converted via unicode(). '''
    try:
        order = collator.collation_order(a)
    except TypeError:
        order = collator.collation_order(unicode(a))
    return order
|
||||
|
||||
load_icu()
|
||||
load_collator()
|
||||
_icu_not_ok = _icu is None or _collator is None
|
||||
_icu, err = plugins['icu']
|
||||
if _icu is None:
|
||||
raise RuntimeError('Failed to load icu with error: %s' % err)
|
||||
del err
|
||||
icu_unicode_version = getattr(_icu, 'unicode_version', None)
|
||||
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
|
||||
|
||||
@ -252,290 +42,208 @@ try:
|
||||
except:
|
||||
pass
|
||||
|
||||
def collator():
    ''' Return the module level collator, creating it on first use. The
    locale is taken from ``_locale`` if set, otherwise from the
    ``locale_for_sorting`` tweak or, failing that, get_lang(). If a collator
    cannot be created for that locale, an English one is used. '''
    global _collator, _locale
    if _collator is not None:
        return _collator
    if _locale is None:
        from calibre.utils.localization import get_lang
        _locale = tweaks['locale_for_sorting'] or get_lang()
    try:
        _collator = _icu.Collator(_locale)
    except Exception as e:
        print ('Failed to load collator for locale: %r with error %r, using English' % (_locale, e))
        _collator = _icu.Collator('en')
    return _collator
|
||||
|
||||
def change_locale(locale=None):
    ''' Discard every cached collator and record ``locale`` as the locale to
    use the next time a collator is created. '''
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator
    _collator = None
    _primary_collator = None
    _sort_collator = None
    _numeric_collator = None
    _case_sensitive_collator = None
    _locale = locale
|
||||
|
||||
def primary_collator():
    ''' Ignores case differences and accented characters. Created on first
    use as a clone of collator() with strength _icu.UCOL_PRIMARY. '''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = col = collator().clone()
    col.strength = _icu.UCOL_PRIMARY
    return col
|
||||
|
||||
def sort_collator():
    ''' Ignores case differences and recognizes numbers in strings (if the
    tweak is set). Created on first use as a clone of collator() with
    strength _icu.UCOL_SECONDARY. '''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = col = collator().clone()
    col.strength = _icu.UCOL_SECONDARY
    col.numeric = tweaks['numeric_collation']
    return col
|
||||
|
||||
def numeric_collator():
    ''' Uses natural sorting for numbers inside strings so something2 will
    sort before something10. Created on first use as a clone of collator()
    with strength _icu.UCOL_SECONDARY and numeric set to True. '''
    global _numeric_collator
    if _numeric_collator is not None:
        return _numeric_collator
    _numeric_collator = col = collator().clone()
    col.strength = _icu.UCOL_SECONDARY
    col.numeric = True
    return col
|
||||
|
||||
def case_sensitive_collator():
    ''' Always sorts upper case letter before lower case. Created on first
    use as a clone of collator() that copies the numeric setting of
    sort_collator() and has upper_first set to True. '''
    global _case_sensitive_collator
    if _case_sensitive_collator is not None:
        return _case_sensitive_collator
    _case_sensitive_collator = col = collator().clone()
    col.numeric = sort_collator().numeric
    col.upper_first = True
    return col
|
||||
|
||||
# Templates that will be used to generate various concrete
|
||||
# function implementations based on different collators, to allow lazy loading
|
||||
# of collators, with maximum runtime performance
|
||||
|
||||
_sort_key_template = '''
|
||||
def {name}(obj):
|
||||
try:
|
||||
try:
|
||||
return {collator}.{func}(obj)
|
||||
except AttributeError:
|
||||
return {collator_func}().{func}(obj)
|
||||
except TypeError:
|
||||
if isinstance(obj, bytes):
|
||||
try:
|
||||
obj = obj.decode(sys.getdefaultencoding())
|
||||
except ValueError:
|
||||
return obj
|
||||
return {collator}.{func}(obj)
|
||||
return b''
|
||||
'''
|
||||
|
||||
_strcmp_template = '''
|
||||
def {name}(a, b):
|
||||
try:
|
||||
try:
|
||||
return {collator}.{func}(a, b)
|
||||
except AttributeError:
|
||||
return {collator_func}().{func}(a, b)
|
||||
except TypeError:
|
||||
if isinstance(a, bytes):
|
||||
try:
|
||||
a = a.decode(sys.getdefaultencoding())
|
||||
except ValueError:
|
||||
return cmp(a, b)
|
||||
elif a is None:
|
||||
a = u''
|
||||
if isinstance(b, bytes):
|
||||
try:
|
||||
b = b.decode(sys.getdefaultencoding())
|
||||
except ValueError:
|
||||
return cmp(a, b)
|
||||
elif b is None:
|
||||
b = u''
|
||||
return {collator}.{func}(a, b)
|
||||
'''
|
||||
|
||||
_change_case_template = '''
|
||||
def {name}(x):
|
||||
try:
|
||||
try:
|
||||
return _icu.change_case(x, _icu.{which}, _locale)
|
||||
except NotImplementedError:
|
||||
collator() # sets _locale
|
||||
return _icu.change_case(x, _icu.{which}, _locale)
|
||||
except TypeError:
|
||||
if isinstance(x, bytes):
|
||||
try:
|
||||
x = x.decode(sys.getdefaultencoding())
|
||||
except ValueError:
|
||||
return x
|
||||
return _icu.change_case(x, _icu.{which}, _locale)
|
||||
raise
|
||||
'''
|
||||
|
||||
def _make_func(template, name, **kwargs):
|
||||
l = globals()
|
||||
kwargs['name'] = name
|
||||
kwargs['func'] = kwargs.get('func', 'sort_key')
|
||||
exec template.format(**kwargs) in l
|
||||
return l[name]
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
################# The string functions ########################################
|
||||
sort_key = _make_func(_sort_key_template, 'sort_key', collator='_sort_collator', collator_func='sort_collator')
|
||||
|
||||
sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
|
||||
numeric_sort_key = _make_func(_sort_key_template, 'numeric_sort_key', collator='_numeric_collator', collator_func='numeric_collator')
|
||||
|
||||
strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
|
||||
primary_sort_key = _make_func(_sort_key_template, 'primary_sort_key', collator='_primary_collator', collator_func='primary_collator')
|
||||
|
||||
case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
|
||||
partial(icu_case_sensitive_sort_key, _collator)
|
||||
case_sensitive_sort_key = _make_func(_sort_key_template, 'case_sensitive_sort_key',
|
||||
collator='_case_sensitive_collator', collator_func='case_sensitive_collator')
|
||||
|
||||
case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
|
||||
collation_order = _make_func(_sort_key_template, 'collation_order', collator='_sort_collator', collator_func='sort_collator', func='collation_order')
|
||||
|
||||
upper = (lambda s: s.upper()) if _icu_not_ok else \
|
||||
partial(icu_change_case, True, get_locale())
|
||||
strcmp = _make_func(_strcmp_template, 'strcmp', collator='_sort_collator', collator_func='sort_collator', func='strcmp')
|
||||
|
||||
lower = (lambda s: s.lower()) if _icu_not_ok else \
|
||||
partial(icu_change_case, False, get_locale())
|
||||
case_sensitive_strcmp = _make_func(
|
||||
_strcmp_template, 'case_sensitive_strcmp', collator='_case_sensitive_collator', collator_func='case_sensitive_collator', func='strcmp')
|
||||
|
||||
title_case = (lambda s: s.title()) if _icu_not_ok else \
|
||||
partial(_icu.title, get_locale())
|
||||
primary_strcmp = _make_func(_strcmp_template, 'primary_strcmp', collator='_primary_collator', collator_func='primary_collator', func='strcmp')
|
||||
|
||||
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
|
||||
(lambda s: icu_capitalize(s))
|
||||
upper = _make_func(_change_case_template, 'upper', which='UPPER_CASE')
|
||||
|
||||
find = (py_find if _icu_not_ok else partial(icu_find, _collator))
|
||||
lower = _make_func(_change_case_template, 'lower', which='LOWER_CASE')
|
||||
|
||||
contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
|
||||
_collator)))
|
||||
title_case = _make_func(_change_case_template, 'title_case', which='TITLE_CASE')
|
||||
|
||||
def primary_strcmp(a, b):
|
||||
'strcmp that ignores case and accents on letters'
|
||||
if _icu_not_ok:
|
||||
from calibre.utils.filenames import ascii_text
|
||||
return py_strcmp(ascii_text(a), ascii_text(b))
|
||||
def capitalize(x):
    ''' Return x with its first character upper cased via upper() and the
    remainder lower cased via lower(). Empty input has no first character,
    so it is returned unchanged instead of raising IndexError. '''
    try:
        return upper(x[0]) + lower(x[1:])
    except IndexError:
        return x
|
||||
|
||||
find = _make_func(_strcmp_template, 'find', collator='_collator', collator_func='collator', func='find')
|
||||
|
||||
primary_find = _make_func(_strcmp_template, 'primary_find', collator='_primary_collator', collator_func='primary_collator', func='find')
|
||||
|
||||
startswith = _make_func(_strcmp_template, 'startswith', collator='_collator', collator_func='collator', func='startswith')
|
||||
|
||||
primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator='_primary_collator', collator_func='primary_collator', func='startswith')
|
||||
|
||||
safe_chr = _icu.chr
|
||||
|
||||
def character_name(string):
|
||||
try:
|
||||
return _primary_collator.strcmp(a, b)
|
||||
except AttributeError:
|
||||
return primary_collator().strcmp(a, b)
|
||||
return _icu.character_name(unicode(string)) or None
|
||||
except (TypeError, ValueError, KeyError):
|
||||
pass
|
||||
|
||||
def primary_find(pat, src):
    ''' find that ignores case and accents on letters. When _icu_not_ok is
    set both arguments are passed through ascii_text and searched with
    py_find, otherwise primary_icu_find does the work. '''
    if not _icu_not_ok:
        return primary_icu_find(pat, src)
    from calibre.utils.filenames import ascii_text
    needle, haystack = ascii_text(pat), ascii_text(src)
    return py_find(needle, haystack)
|
||||
|
||||
def primary_icu_find(pat, src):
|
||||
def character_name_from_code(code):
|
||||
try:
|
||||
return icu_find(_primary_collator, pat, src)
|
||||
except AttributeError:
|
||||
return icu_find(primary_collator(), pat, src)
|
||||
return _icu.character_name_from_code(code) or ''
|
||||
except (TypeError, ValueError, KeyError):
|
||||
return ''
|
||||
|
||||
def primary_sort_key(val):
    ''' A sort key that ignores case and diacritics. When _icu_not_ok is set
    the key is the lower cased ascii_text of val, otherwise it comes from the
    primary collator, which is created on demand. '''
    if _icu_not_ok:
        from calibre.utils.filenames import ascii_text
        plain = ascii_text(val)
        return plain.lower()
    try:
        key = _primary_collator.sort_key(val)
    except AttributeError:
        key = primary_collator().sort_key(val)
    return key
|
||||
def normalize(text, mode='NFC'):
    ''' Normalize ``text`` (converted with unicode()) with ICU, using the
    entry of ``_nmodes`` named by ``mode``. '''
    # NOTE: this is very slightly slower than unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Its speed also
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    icu_normalize = _icu.normalize
    return icu_normalize(_nmodes[mode], unicode(text))
|
||||
|
||||
def primary_startswith(a, b):
    ''' Return whether a starts with b. When _icu_not_ok is set the lower
    cased ascii_text forms are compared, otherwise icu_startswith is used
    with the primary collator, which is created on demand. '''
    if _icu_not_ok:
        from calibre.utils.filenames import ascii_text
        text = ascii_text(a).lower()
        prefix = ascii_text(b).lower()
        return text.startswith(prefix)
    try:
        result = icu_startswith(_primary_collator, a, b)
    except AttributeError:
        result = icu_startswith(primary_collator(), a, b)
    return result
|
||||
def contractions(col=None):
    ''' Return the contractions of the collator ``col`` as a frozenset with
    falsy entries removed. When ``col`` is not given, the module level
    collator is used and created if necessary. Results are cached per
    collator in ``_cmap``. '''
    col = col or _collator
    if col is None:
        col = collator()
    # The cache must be looked up with the same key it is stored under (col);
    # using the collator() function as the key meant the cache never hit.
    ans = _cmap.get(col)
    if ans is None:
        # "or ()" guards against a collator reporting no contractions at all
        ans = _cmap[col] = frozenset(filter(None, col.contractions() or ()))
    return ans
|
||||
|
||||
def collation_order(a):
    ''' Return the collation order of a. When _icu_not_ok is set this is
    (ord(a[0]), 1), or (0, 0) for falsy a; otherwise icu_collation_order is
    used with the sort collator, which is created on demand. '''
    if _icu_not_ok:
        if not a:
            return (0, 0)
        return (ord(a[0]), 1)
    try:
        order = icu_collation_order(_sort_collator, a)
    except AttributeError:
        order = icu_collation_order(sort_collator(), a)
    return order
|
||||
|
||||
################################################################################
|
||||
|
||||
def test(): # {{{
|
||||
from calibre import prints
|
||||
# Data {{{
|
||||
german = '''
|
||||
Sonntag
|
||||
Montag
|
||||
Dienstag
|
||||
Januar
|
||||
Februar
|
||||
März
|
||||
Fuße
|
||||
Fluße
|
||||
Flusse
|
||||
flusse
|
||||
fluße
|
||||
flüße
|
||||
flüsse
|
||||
'''
|
||||
german_good = '''
|
||||
Dienstag
|
||||
Februar
|
||||
flusse
|
||||
Flusse
|
||||
fluße
|
||||
Fluße
|
||||
flüsse
|
||||
flüße
|
||||
Fuße
|
||||
Januar
|
||||
März
|
||||
Montag
|
||||
Sonntag'''
|
||||
french = '''
|
||||
dimanche
|
||||
lundi
|
||||
mardi
|
||||
janvier
|
||||
février
|
||||
mars
|
||||
déjà
|
||||
Meme
|
||||
deja
|
||||
même
|
||||
dejà
|
||||
bpef
|
||||
bœg
|
||||
Boef
|
||||
Mémé
|
||||
bœf
|
||||
boef
|
||||
bnef
|
||||
pêche
|
||||
pèché
|
||||
pêché
|
||||
pêche
|
||||
pêché'''
|
||||
french_good = '''
|
||||
bnef
|
||||
boef
|
||||
Boef
|
||||
bœf
|
||||
bœg
|
||||
bpef
|
||||
deja
|
||||
dejà
|
||||
déjà
|
||||
dimanche
|
||||
février
|
||||
janvier
|
||||
lundi
|
||||
mardi
|
||||
mars
|
||||
Meme
|
||||
Mémé
|
||||
même
|
||||
pèché
|
||||
pêche
|
||||
pêche
|
||||
pêché
|
||||
pêché'''
|
||||
# }}}
|
||||
|
||||
def create(l):
|
||||
l = l.decode('utf-8').splitlines()
|
||||
return [x.strip() for x in l if x.strip()]
|
||||
|
||||
def test_strcmp(entries):
|
||||
for x in entries:
|
||||
for y in entries:
|
||||
if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
|
||||
print 'strcmp failed for %r, %r'%(x, y)
|
||||
|
||||
german = create(german)
|
||||
c = _icu.Collator('de')
|
||||
c.numeric = True
|
||||
gs = list(sorted(german, key=c.sort_key))
|
||||
if gs != create(german_good):
|
||||
print 'German sorting failed'
|
||||
return
|
||||
print
|
||||
french = create(french)
|
||||
c = _icu.Collator('fr')
|
||||
c.numeric = True
|
||||
fs = list(sorted(french, key=c.sort_key))
|
||||
if fs != create(french_good):
|
||||
print 'French sorting failed (note that French fails with icu < 4.6)'
|
||||
return
|
||||
test_strcmp(german + french)
|
||||
|
||||
print '\nTesting case transforms in current locale'
|
||||
from calibre.utils.titlecase import titlecase
|
||||
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
|
||||
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
|
||||
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
|
||||
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
|
||||
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
|
||||
print
|
||||
|
||||
print '\nTesting primary collation'
|
||||
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
|
||||
u'Štepánek':u'ŠtepaneK'}.iteritems():
|
||||
if primary_strcmp(k, v) != 0:
|
||||
prints('primary_strcmp() failed with %s != %s'%(k, v))
|
||||
return
|
||||
if primary_find(v, u' '+k)[0] != 1:
|
||||
prints('primary_find() failed with %s not in %s'%(v, k))
|
||||
return
|
||||
|
||||
n = character_name(safe_chr(0x1f431))
|
||||
if n != u'CAT FACE':
|
||||
raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
|
||||
|
||||
global _primary_collator
|
||||
orig = _primary_collator
|
||||
_primary_collator = _icu.Collator('es')
|
||||
if primary_strcmp(u'peña', u'pena') == 0:
|
||||
print 'Primary collation in Spanish locale failed'
|
||||
return
|
||||
_primary_collator = orig
|
||||
|
||||
print '\nTesting contractions'
|
||||
c = _icu.Collator('cs')
|
||||
if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
|
||||
u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
|
||||
u'S\u030c', u'R\u030c']):
|
||||
print 'Contractions for the Czech language failed'
|
||||
return
|
||||
|
||||
print '\nTesting startswith'
|
||||
p = primary_startswith
|
||||
if (not p('asd', 'asd') or not p('asd', 'A') or
|
||||
not p('x', '')):
|
||||
print 'startswith() failed'
|
||||
return
|
||||
|
||||
print '\nTesting collation_order()'
|
||||
for group in [
|
||||
('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
|
||||
('calibre', 'Charon', 'Collins'),
|
||||
('01', '1'),
|
||||
('1', '11', '13'),
|
||||
]:
|
||||
last = None
|
||||
for x in group:
|
||||
val = icu_collation_order(sort_collator(), x)
|
||||
if val[1] != 1:
|
||||
prints('collation_order() returned incorrect length for', x)
|
||||
if last is None:
|
||||
last = val
|
||||
else:
|
||||
if val != last:
|
||||
prints('collation_order() returned incorrect value for', x)
|
||||
last = val
|
||||
|
||||
# }}}
|
||||
|
||||
def test_roundtrip():
    ''' Raise ValueError if _icu.roundtrip() does not return any of the
    sample strings unchanged. '''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for r in samples:
        rp = _icu.roundtrip(r)
        if rp == r:
            continue
        raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp))
|
||||
|
||||
def test_normalize_performance():
    ''' Time 100 runs of normalize() against 100 runs of
    unicodedata.normalize('NFC', ...) on the UTF-8 text in t.txt and print
    both timings. Does nothing if t.txt does not exist in the current
    directory. '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Use a context manager so the file handle is closed promptly
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time, unicodedata
    st = time.time()
    count = 100
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_roundtrip()
|
||||
test_normalize_performance()
|
||||
test()
|
||||
from calibre.utils.icu_test import run
|
||||
run(verbosity=4)
|
||||
|
||||
|
148
src/calibre/utils/icu_test.py
Normal file
148
src/calibre/utils/icu_test.py
Normal file
@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import unittest, sys
|
||||
from contextlib import contextmanager
|
||||
|
||||
import calibre.utils.icu as icu
|
||||
|
||||
|
||||
@contextmanager
def make_collation_func(name, locale, numeric=True, template='_sort_key_template', func='strcmp'):
    ''' Context manager yielding a function called ``name``, generated by
    icu._make_func from the icu template named ``template``, that calls
    ``func`` on a fresh collator for ``locale``. The collator is stored as a
    temporary attribute of the icu module so the generated function can find
    it, and is removed again on exit. '''
    c = icu._icu.Collator(locale)
    cname = '%s_test_collator%s' % (name, template)
    setattr(icu, cname, c)
    c.numeric = numeric
    try:
        yield icu._make_func(getattr(icu, template), name, collator=cname, collator_func='not_used_xxx', func=func)
    finally:
        # Without try/finally a failing assertion inside the with block would
        # leave the temporary collator behind on the icu module
        delattr(icu, cname)
|
||||
|
||||
class TestICU(unittest.TestCase):

    ''' Tests for the string functions exported by calibre.utils.icu '''

    ae = unittest.TestCase.assertEqual

    def setUp(self):
        ' Start every test from the English locale '
        icu.change_locale('en')

    def test_sorting(self):
        ' Test the various sorting APIs '
        german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split()
        german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split()
        french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split()
        french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split()  # noqa

        # Test corner cases
        sort_key = icu.sort_key
        s = '\U0001f431'
        self.ae(sort_key(s), sort_key(s.encode(sys.getdefaultencoding())), 'UTF-8 encoded object not correctly decoded to generate sort key')
        # The bytestring has to actually go through sort_key(); comparing it
        # with itself can never fail
        self.ae(s.encode('utf-16'), sort_key(s.encode('utf-16')), 'Undecodable bytestring not returned as itself')
        self.ae(b'', sort_key(None))
        self.ae(0, icu.strcmp(None, b''))
        self.ae(0, icu.strcmp(s, s.encode(sys.getdefaultencoding())))

        # Test locales
        with make_collation_func('dsk', 'de', func='sort_key') as dsk:
            self.ae(german_good, sorted(german, key=dsk))
            with make_collation_func('dcmp', 'de', template='_strcmp_template') as dcmp:
                for x in german:
                    for y in german:
                        self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y))

        with make_collation_func('fsk', 'fr', func='sort_key') as fsk:
            self.ae(french_good, sorted(french, key=fsk))
            with make_collation_func('fcmp', 'fr', template='_strcmp_template') as fcmp:
                for x in french:
                    for y in french:
                        self.ae(cmp(fsk(x), fsk(y)), fcmp(x, y))

        with make_collation_func('ssk', 'es', func='sort_key') as ssk:
            self.assertNotEqual(ssk('peña'), ssk('pena'))
            with make_collation_func('scmp', 'es', template='_strcmp_template') as scmp:
                self.assertNotEqual(0, scmp('pena', 'peña'))

        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae(0, icu.primary_strcmp(k, v))

        # Test different types of collation
        self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa'))
        self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11'))
        self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a'))
        self.ae(0, icu.strcmp('a', 'A'))
        self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A'))
        self.ae(0, icu.primary_strcmp('ä', 'A'))

    def test_change_case(self):
        ' Test the various ways of changing the case '
        from calibre.utils.titlecase import titlecase
        # Test corner cases
        self.ae('A', icu.upper(b'a'))

        for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'):
            self.ae(icu.upper(x), x.upper())
            self.ae(icu.lower(x), x.lower())
            # ICU's title case algorithm is different from ours, when there are
            # capitals inside words
            self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine'))
            self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower())

    def test_find(self):
        ' Test searching for substrings '
        self.ae((1, 1), icu.find(b'a', b'1ab'))
        self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
        self.assertTrue(icu.startswith(b'abc', b'ab'))
        self.assertTrue(icu.startswith('abc', 'abc'))
        self.assertFalse(icu.startswith('xyz', 'a'))
        self.assertTrue(icu.startswith('xxx', ''))
        self.assertTrue(icu.primary_startswith('pena', 'peña'))

    def test_collation_order(self):
        'Testing collation ordering'
        for group in [
            ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
            ('01', '1'),
            ('1', '11', '13'),
        ]:
            # Every member of a group must report the same collation order
            last = None
            for x in group:
                order, length = icu.numeric_collator().collation_order(x)
                if last is not None:
                    self.ae(last, order)
                last = order

    def test_roundtrip(self):
        ' Test that strings survive a round trip through the ICU module '
        for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
            self.ae(r, icu._icu.roundtrip(r))

    def test_character_name(self):
        ' Test looking up the name of a character '
        self.ae(icu.character_name('\U0001f431'), 'CAT FACE')

    def test_contractions(self):
        ' Test the contractions reported for the Czech collator '
        c = icu._icu.Collator('cs')
        self.ae(icu.contractions(c), frozenset({u'Z\u030c', u'z\u030c', u'Ch',
            u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
            u'S\u030c', u'R\u030c'}))
|
||||
|
||||
class TestRunner(unittest.main):

    ''' unittest.main subclass whose test suite is always the tests of
    TestICU '''

    def createTests(self):
        ' Load the tests of TestICU into self.test '
        self.test = unittest.TestLoader().loadTestsFromTestCase(TestICU)
|
||||
|
||||
def run(verbosity=4):
    ''' Run the TestICU tests with the given verbosity, without exiting the
    interpreter afterwards. '''
    options = {'verbosity': verbosity, 'exit': False}
    TestRunner(**options)
|
||||
|
||||
def test_build():
    ''' Run the TestICU tests quietly, stopping at the first failure, and
    raise SystemExit(1) if the run was not successful. '''
    runner = TestRunner(verbosity=0, buffer=True, catchbreak=True, failfast=True, argv=sys.argv[:1], exit=False)
    if runner.result.wasSuccessful():
        return
    raise SystemExit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
run(verbosity=4)
|
||||
|
Loading…
x
Reference in New Issue
Block a user