mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Performance improvements and code cleanup for the ICU module
This commit is contained in:
parent
b8e414f18b
commit
f078cd7168
@ -14,13 +14,13 @@ from PyQt4.Qt import (QLineEdit, QAbstractListModel, Qt, pyqtSignal, QObject,
|
|||||||
QApplication, QListView, QPoint, QModelIndex, QFont, QFontInfo)
|
QApplication, QListView, QPoint, QModelIndex, QFont, QFontInfo)
|
||||||
|
|
||||||
from calibre.constants import isosx, get_osx_version
|
from calibre.constants import isosx, get_osx_version
|
||||||
from calibre.utils.icu import sort_key, primary_startswith, primary_icu_find
|
from calibre.utils.icu import sort_key, primary_startswith, primary_find
|
||||||
from calibre.gui2 import NONE
|
from calibre.gui2 import NONE
|
||||||
from calibre.gui2.widgets import EnComboBox, LineEditECM
|
from calibre.gui2.widgets import EnComboBox, LineEditECM
|
||||||
from calibre.utils.config import tweaks
|
from calibre.utils.config import tweaks
|
||||||
|
|
||||||
def containsq(x, prefix):
|
def containsq(x, prefix):
|
||||||
return primary_icu_find(prefix, x)[0] != -1
|
return primary_find(prefix, x)[0] != -1
|
||||||
|
|
||||||
class CompleteModel(QAbstractListModel): # {{{
|
class CompleteModel(QAbstractListModel): # {{{
|
||||||
|
|
||||||
|
@ -113,10 +113,9 @@ def test_ssl():
|
|||||||
print ('SSL OK!')
|
print ('SSL OK!')
|
||||||
|
|
||||||
def test_icu():
|
def test_icu():
|
||||||
from calibre.utils.icu import _icu_not_ok, test_roundtrip
|
print ('Testing ICU')
|
||||||
if _icu_not_ok:
|
from calibre.utils.icu_test import test_build
|
||||||
raise RuntimeError('ICU module not loaded/valid')
|
test_build()
|
||||||
test_roundtrip()
|
|
||||||
print ('ICU OK!')
|
print ('ICU OK!')
|
||||||
|
|
||||||
def test_wpd():
|
def test_wpd():
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
#include "icu_calibre_utils.h"
|
#include "icu_calibre_utils.h"
|
||||||
|
|
||||||
|
#define UPPER_CASE 0
|
||||||
|
#define LOWER_CASE 1
|
||||||
|
#define TITLE_CASE 2
|
||||||
|
|
||||||
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
|
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
|
||||||
wchar_t *buf = NULL;
|
wchar_t *buf = NULL;
|
||||||
PyObject *ans = NULL;
|
PyObject *ans = NULL;
|
||||||
@ -66,20 +70,16 @@ icu_Collator_display_name(icu_Collator *self, void *closure) {
|
|||||||
const char *loc = NULL;
|
const char *loc = NULL;
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
UChar dname[400];
|
UChar dname[400];
|
||||||
char buf[100];
|
int32_t sz = 0;
|
||||||
|
|
||||||
loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status);
|
loc = ucol_getLocaleByType(self->collator, ULOC_ACTUAL_LOCALE, &status);
|
||||||
if (loc == NULL || U_FAILURE(status)) {
|
if (loc == NULL) {
|
||||||
PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL;
|
PyErr_SetString(PyExc_Exception, "Failed to get actual locale"); return NULL;
|
||||||
}
|
}
|
||||||
ucol_getDisplayName(loc, "en", dname, 100, &status);
|
sz = ucol_getDisplayName(loc, "en", dname, sizeof(dname), &status);
|
||||||
if (U_FAILURE(status)) return PyErr_NoMemory();
|
if (U_FAILURE(status)) {PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }
|
||||||
|
|
||||||
u_strToUTF8(buf, 100, NULL, dname, -1, &status);
|
return icu_to_python(dname, sz);
|
||||||
if (U_FAILURE(status)) {
|
|
||||||
PyErr_SetString(PyExc_Exception, "Failed to convert dname to UTF-8"); return NULL;
|
|
||||||
}
|
|
||||||
return Py_BuildValue("s", buf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// }}}
|
// }}}
|
||||||
@ -140,47 +140,29 @@ icu_Collator_capsule(icu_Collator *self, void *closure) {
|
|||||||
// Collator.sort_key {{{
|
// Collator.sort_key {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
char *input;
|
int32_t sz = 0, key_size = 0, bsz = 0;
|
||||||
int32_t sz;
|
UChar *buf = NULL;
|
||||||
UChar *buf;
|
uint8_t *buf2 = NULL;
|
||||||
uint8_t *buf2;
|
PyObject *ans = NULL, *input = NULL;
|
||||||
PyObject *ans;
|
|
||||||
int32_t key_size;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "es", "UTF-8", &input)) return NULL;
|
if (!PyArg_ParseTuple(args, "O", &input)) return NULL;
|
||||||
|
buf = python_to_icu(input, &sz, 1);
|
||||||
|
if (buf == NULL) return NULL;
|
||||||
|
|
||||||
sz = (int32_t)strlen(input);
|
bsz = 7 * sz + 1;
|
||||||
|
buf2 = (uint8_t*)calloc(bsz, sizeof(uint8_t));
|
||||||
|
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, bsz);
|
||||||
|
if (key_size > bsz) {
|
||||||
|
buf2 = realloc(buf2, (key_size + 1) * sizeof(uint8_t));
|
||||||
|
if (buf2 == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
key_size = ucol_getSortKey(self->collator, buf, sz, buf2, key_size + 1);
|
||||||
|
}
|
||||||
|
ans = PyBytes_FromStringAndSize((char*)buf2, key_size);
|
||||||
|
|
||||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
end:
|
||||||
|
if (buf != NULL) free(buf);
|
||||||
if (buf == NULL) return PyErr_NoMemory();
|
if (buf2 != NULL) free(buf2);
|
||||||
|
|
||||||
u_strFromUTF8(buf, sz*4 + 1, &key_size, input, sz, &status);
|
|
||||||
PyMem_Free(input);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status)) {
|
|
||||||
buf2 = (uint8_t*)calloc(7*sz+1, sizeof(uint8_t));
|
|
||||||
if (buf2 == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
key_size = ucol_getSortKey(self->collator, buf, -1, buf2, 7*sz+1);
|
|
||||||
|
|
||||||
if (key_size == 0) {
|
|
||||||
ans = PyBytes_FromString("");
|
|
||||||
} else {
|
|
||||||
if (key_size >= 7*sz+1) {
|
|
||||||
free(buf2);
|
|
||||||
buf2 = (uint8_t*)calloc(key_size+1, sizeof(uint8_t));
|
|
||||||
if (buf2 == NULL) return PyErr_NoMemory();
|
|
||||||
ucol_getSortKey(self->collator, buf, -1, buf2, key_size+1);
|
|
||||||
}
|
|
||||||
ans = PyBytes_FromString((char *)buf2);
|
|
||||||
}
|
|
||||||
free(buf2);
|
|
||||||
} else ans = PyBytes_FromString("");
|
|
||||||
|
|
||||||
free(buf);
|
|
||||||
if (ans == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
} // }}}
|
} // }}}
|
||||||
@ -188,86 +170,64 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
|||||||
// Collator.strcmp {{{
|
// Collator.strcmp {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
char *a_, *b_;
|
PyObject *a_ = NULL, *b_ = NULL;
|
||||||
int32_t asz, bsz;
|
int32_t asz = 0, bsz = 0;
|
||||||
UChar *a, *b;
|
UChar *a = NULL, *b = NULL;
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
UCollationResult res = UCOL_EQUAL;
|
UCollationResult res = UCOL_EQUAL;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL;
|
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||||
|
|
||||||
asz = (int32_t)strlen(a_); bsz = (int32_t)strlen(b_);
|
|
||||||
|
|
||||||
a = (UChar*)calloc(asz*4 + 1, sizeof(UChar));
|
a = python_to_icu(a_, &asz, 1);
|
||||||
b = (UChar*)calloc(bsz*4 + 1, sizeof(UChar));
|
if (a == NULL) goto end;
|
||||||
|
b = python_to_icu(b_, &bsz, 1);
|
||||||
|
if (b == NULL) goto end;
|
||||||
|
res = ucol_strcoll(self->collator, a, asz, b, bsz);
|
||||||
|
end:
|
||||||
|
if (a != NULL) free(a); if (b != NULL) free(b);
|
||||||
|
|
||||||
|
return (PyErr_Occurred()) ? NULL : Py_BuildValue("i", res);
|
||||||
if (a == NULL || b == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
u_strFromUTF8(a, asz*4 + 1, NULL, a_, asz, &status);
|
|
||||||
u_strFromUTF8(b, bsz*4 + 1, NULL, b_, bsz, &status);
|
|
||||||
PyMem_Free(a_); PyMem_Free(b_);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status))
|
|
||||||
res = ucol_strcoll(self->collator, a, -1, b, -1);
|
|
||||||
|
|
||||||
free(a); free(b);
|
|
||||||
|
|
||||||
return Py_BuildValue("i", res);
|
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// Collator.find {{{
|
// Collator.find {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
PyObject *a_, *b_;
|
PyObject *a_ = NULL, *b_ = NULL;
|
||||||
int32_t asz, bsz;
|
UChar *a = NULL, *b = NULL;
|
||||||
UChar *a, *b;
|
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
|
||||||
wchar_t *aw, *bw;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
UStringSearch *search = NULL;
|
UStringSearch *search = NULL;
|
||||||
int32_t pos = -1, length = -1;
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
|
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||||
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
|
|
||||||
|
|
||||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
|
||||||
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
|
|
||||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
|
||||||
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
|
|
||||||
|
|
||||||
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
|
a = python_to_icu(a_, &asz, 1);
|
||||||
|
if (a == NULL) goto end;
|
||||||
PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
b = python_to_icu(b_, &bsz, 1);
|
||||||
PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
|
if (b == NULL) goto end;
|
||||||
u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status);
|
|
||||||
u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status);
|
|
||||||
|
|
||||||
|
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
search = usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status);
|
pos = usearch_first(search, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (pos != USEARCH_DONE)
|
||||||
pos = usearch_first(search, &status);
|
length = usearch_getMatchedLength(search);
|
||||||
if (pos != USEARCH_DONE)
|
else
|
||||||
length = usearch_getMatchedLength(search);
|
pos = -1;
|
||||||
else
|
|
||||||
pos = -1;
|
|
||||||
}
|
|
||||||
if (search != NULL) usearch_close(search);
|
|
||||||
}
|
}
|
||||||
|
end:
|
||||||
|
if (search != NULL) usearch_close(search);
|
||||||
|
if (a != NULL) free(a);
|
||||||
|
if (b != NULL) free(b);
|
||||||
|
|
||||||
free(a); free(b); free(aw); free(bw);
|
return (PyErr_Occurred()) ? NULL : Py_BuildValue("ii", pos, length);
|
||||||
|
|
||||||
return Py_BuildValue("ii", pos, length);
|
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// Collator.contractions {{{
|
// Collator.contractions {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
UChar *str;
|
UChar *str = NULL;
|
||||||
UChar32 start=0, end=0;
|
UChar32 start=0, end=0;
|
||||||
int32_t count = 0, len = 0, dlen = 0, i;
|
int32_t count = 0, len = 0, i;
|
||||||
PyObject *ans = Py_None, *pbuf;
|
PyObject *ans = Py_None, *pbuf;
|
||||||
wchar_t *buf;
|
|
||||||
|
|
||||||
if (self->contractions == NULL) {
|
if (self->contractions == NULL) {
|
||||||
self->contractions = uset_open(1, 0);
|
self->contractions = uset_open(1, 0);
|
||||||
@ -275,107 +235,112 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs)
|
|||||||
self->contractions = ucol_getTailoredSet(self->collator, &status);
|
self->contractions = ucol_getTailoredSet(self->collator, &status);
|
||||||
}
|
}
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
|
count = uset_getItemCount(self->contractions);
|
||||||
|
|
||||||
str = (UChar*)calloc(100, sizeof(UChar));
|
str = (UChar*)calloc(100, sizeof(UChar));
|
||||||
buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t));
|
if (str == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
if (str == NULL || buf == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
count = uset_getItemCount(self->contractions);
|
|
||||||
ans = PyTuple_New(count);
|
ans = PyTuple_New(count);
|
||||||
if (ans != NULL) {
|
if (ans == NULL) { goto end; }
|
||||||
for (i = 0; i < count; i++) {
|
|
||||||
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
|
for (i = 0; i < count; i++) {
|
||||||
if (len >= 2) {
|
len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
|
||||||
// We have a string
|
if (len >= 2) {
|
||||||
status = U_ZERO_ERROR;
|
// We have a string
|
||||||
u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status);
|
status = U_ZERO_ERROR;
|
||||||
pbuf = PyUnicode_FromWideChar(buf, dlen);
|
pbuf = icu_to_python(str, len);
|
||||||
if (pbuf == NULL) return PyErr_NoMemory();
|
if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
|
||||||
PyTuple_SetItem(ans, i, pbuf);
|
PyTuple_SetItem(ans, i, pbuf);
|
||||||
} else {
|
} else {
|
||||||
// Ranges dont make sense for contractions, ignore them
|
// Ranges dont make sense for contractions, ignore them
|
||||||
PyTuple_SetItem(ans, i, Py_None);
|
PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(str); free(buf);
|
end:
|
||||||
|
if (str != NULL) free(str);
|
||||||
|
|
||||||
return Py_BuildValue("O", ans);
|
return ans;
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// Collator.startswith {{{
|
// Collator.startswith {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_startswith(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_startswith(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
PyObject *a_, *b_;
|
PyObject *a_ = NULL, *b_ = NULL;
|
||||||
int32_t asz, bsz;
|
int32_t asz = 0, bsz = 0;
|
||||||
int32_t actual_a, actual_b;
|
UChar *a = NULL, *b = NULL;
|
||||||
UChar *a, *b;
|
uint8_t ans = 0;
|
||||||
wchar_t *aw, *bw;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
int ans = 0;
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
|
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
|
||||||
asz = (int32_t)PyUnicode_GetSize(a_); bsz = (int32_t)PyUnicode_GetSize(b_);
|
|
||||||
if (asz < bsz) Py_RETURN_FALSE;
|
a = python_to_icu(a_, &asz, 1);
|
||||||
if (bsz == 0) Py_RETURN_TRUE;
|
if (a == NULL) goto end;
|
||||||
|
b = python_to_icu(b_, &bsz, 1);
|
||||||
|
if (b == NULL) goto end;
|
||||||
|
|
||||||
|
if (asz < bsz) goto end;
|
||||||
|
if (bsz == 0) { ans = 1; goto end; }
|
||||||
|
|
||||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
ans = ucol_equal(self->collator, a, bsz, b, bsz);
|
||||||
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
|
|
||||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
|
||||||
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
|
|
||||||
|
|
||||||
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
|
end:
|
||||||
|
if (a != NULL) free(a);
|
||||||
|
if (b != NULL) free(b);
|
||||||
|
|
||||||
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
if (PyErr_Occurred()) return NULL;
|
||||||
actual_b = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
|
if (ans) { Py_RETURN_TRUE; }
|
||||||
if (actual_a > -1 && actual_b > -1) {
|
|
||||||
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
|
|
||||||
u_strFromWCS(b, bsz*4 + 1, &actual_b, bw, -1, &status);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status) && ucol_equal(self->collator, a, actual_b, b, actual_b))
|
|
||||||
ans = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(a); free(b); free(aw); free(bw);
|
|
||||||
if (ans) Py_RETURN_TRUE;
|
|
||||||
Py_RETURN_FALSE;
|
Py_RETURN_FALSE;
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// Collator.startswith {{{
|
// Collator.collation_order {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_Collator_collation_order(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
icu_Collator_collation_order(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
PyObject *a_;
|
PyObject *a_ = NULL;
|
||||||
int32_t asz;
|
int32_t asz = 0;
|
||||||
int32_t actual_a;
|
UChar *a = NULL;
|
||||||
UChar *a;
|
|
||||||
wchar_t *aw;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
UCollationElements *iter = NULL;
|
UCollationElements *iter = NULL;
|
||||||
int order = 0, len = -1;
|
int order = 0, len = -1;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "U", &a_)) return NULL;
|
if (!PyArg_ParseTuple(args, "O", &a_)) return NULL;
|
||||||
asz = (int32_t)PyUnicode_GetSize(a_);
|
|
||||||
|
|
||||||
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
|
||||||
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
|
||||||
|
|
||||||
if (a == NULL || aw == NULL ) return PyErr_NoMemory();
|
a = python_to_icu(a_, &asz, 1);
|
||||||
|
if (a == NULL) goto end;
|
||||||
|
|
||||||
actual_a = (int32_t)PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
iter = ucol_openElements(self->collator, a, asz, &status);
|
||||||
if (actual_a > -1) {
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
|
||||||
u_strFromWCS(a, asz*4 + 1, &actual_a, aw, -1, &status);
|
order = ucol_next(iter, &status);
|
||||||
iter = ucol_openElements(self->collator, a, actual_a, &status);
|
len = ucol_getOffset(iter);
|
||||||
if (iter != NULL && U_SUCCESS(status)) {
|
end:
|
||||||
order = ucol_next(iter, &status);
|
if (iter != NULL) ucol_closeElements(iter); iter = NULL;
|
||||||
len = ucol_getOffset(iter);
|
if (a != NULL) free(a);
|
||||||
ucol_closeElements(iter); iter = NULL;
|
if (PyErr_Occurred()) return NULL;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
free(a); free(aw);
|
|
||||||
return Py_BuildValue("ii", order, len);
|
return Py_BuildValue("ii", order, len);
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
|
// Collator.upper_first {{{
|
||||||
|
static PyObject *
|
||||||
|
icu_Collator_get_upper_first(icu_Collator *self, void *closure) {
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UColAttributeValue val;
|
||||||
|
|
||||||
|
val = ucol_getAttribute(self->collator, UCOL_CASE_FIRST, &status);
|
||||||
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return NULL; }
|
||||||
|
|
||||||
|
if (val == UCOL_OFF) { Py_RETURN_NONE; }
|
||||||
|
if (val) {
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
icu_Collator_set_upper_first(icu_Collator *self, PyObject *val, void *closure) {
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
ucol_setAttribute(self->collator, UCOL_CASE_FIRST, (val == Py_None) ? UCOL_OFF : ((PyObject_IsTrue(val)) ? UCOL_UPPER_FIRST : UCOL_LOWER_FIRST), &status);
|
||||||
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; }
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
// }}}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs);
|
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs);
|
||||||
|
|
||||||
@ -432,6 +397,11 @@ static PyGetSetDef icu_Collator_getsetters[] = {
|
|||||||
(char *)"The strength of this collator.",
|
(char *)"The strength of this collator.",
|
||||||
NULL},
|
NULL},
|
||||||
|
|
||||||
|
{(char *)"upper_first",
|
||||||
|
(getter)icu_Collator_get_upper_first, (setter)icu_Collator_set_upper_first,
|
||||||
|
(char *)"Whether this collator should always put upper case letters before lower case. Values are: None - means use the tertiary strength of the letters. True - Always sort upper case before lower case. False - Always sort lower case before upper case.",
|
||||||
|
NULL},
|
||||||
|
|
||||||
{(char *)"numeric",
|
{(char *)"numeric",
|
||||||
(getter)icu_Collator_get_numeric, (setter)icu_Collator_set_numeric,
|
(getter)icu_Collator_get_numeric, (setter)icu_Collator_set_numeric,
|
||||||
(char *)"If True the collator sorts contiguous digits as numbers rather than strings, so 2 will sort before 10.",
|
(char *)"If True the collator sorts contiguous digits as numbers rather than strings, so 2 will sort before 10.",
|
||||||
@ -513,139 +483,45 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
|
|||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
|
|
||||||
// upper {{{
|
// change_case {{{
|
||||||
static PyObject *
|
|
||||||
icu_upper(PyObject *self, PyObject *args) {
|
static PyObject* icu_change_case(PyObject *self, PyObject *args) {
|
||||||
char *input, *ans, *buf3 = NULL;
|
char *locale = NULL;
|
||||||
const char *loc;
|
PyObject *input = NULL, *result = NULL;
|
||||||
int32_t sz;
|
int which = UPPER_CASE;
|
||||||
UChar *buf, *buf2;
|
|
||||||
PyObject *ret;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UChar *input_buf = NULL, *output_buf = NULL;
|
||||||
|
int32_t sz = 0;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
if (!PyArg_ParseTuple(args, "Oiz", &input, &which, &locale)) return NULL;
|
||||||
|
if (locale == NULL) {
|
||||||
sz = (int32_t)strlen(input);
|
PyErr_SetString(PyExc_NotImplementedError, "You must specify a locale"); // We deliberately use NotImplementedError so that this error can be unambiguously identified
|
||||||
|
return NULL;
|
||||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
|
||||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
|
||||||
|
|
||||||
|
|
||||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
|
||||||
u_strToUpper(buf2, sz*8, buf, -1, loc, &status);
|
|
||||||
|
|
||||||
ans = input;
|
|
||||||
sz = u_strlen(buf2);
|
|
||||||
free(buf);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status) && sz > 0) {
|
|
||||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
|
||||||
if (buf3 == NULL) return PyErr_NoMemory();
|
|
||||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
|
||||||
if (U_SUCCESS(status)) ans = buf3;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
input_buf = python_to_icu(input, &sz, 1);
|
||||||
if (ret == NULL) return PyErr_NoMemory();
|
if (input_buf == NULL) goto end;
|
||||||
|
output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
|
||||||
|
if (output_buf == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
|
|
||||||
free(buf2);
|
switch (which) {
|
||||||
if (buf3 != NULL) free(buf3);
|
case TITLE_CASE:
|
||||||
PyMem_Free(input);
|
sz = u_strToTitle(output_buf, 3 * sz, input_buf, sz, NULL, locale, &status);
|
||||||
|
break;
|
||||||
return ret;
|
case UPPER_CASE:
|
||||||
} // }}}
|
sz = u_strToUpper(output_buf, 3 * sz, input_buf, sz, locale, &status);
|
||||||
|
break;
|
||||||
// lower {{{
|
default:
|
||||||
static PyObject *
|
sz = u_strToLower(output_buf, 3 * sz, input_buf, sz, locale, &status);
|
||||||
icu_lower(PyObject *self, PyObject *args) {
|
|
||||||
char *input, *ans, *buf3 = NULL;
|
|
||||||
const char *loc;
|
|
||||||
int32_t sz;
|
|
||||||
UChar *buf, *buf2;
|
|
||||||
PyObject *ret;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
|
||||||
|
|
||||||
sz = (int32_t)strlen(input);
|
|
||||||
|
|
||||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
|
||||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
|
||||||
|
|
||||||
|
|
||||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
|
||||||
u_strToLower(buf2, sz*8, buf, -1, loc, &status);
|
|
||||||
|
|
||||||
ans = input;
|
|
||||||
sz = u_strlen(buf2);
|
|
||||||
free(buf);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status) && sz > 0) {
|
|
||||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
|
||||||
if (buf3 == NULL) return PyErr_NoMemory();
|
|
||||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
|
||||||
if (U_SUCCESS(status)) ans = buf3;
|
|
||||||
}
|
}
|
||||||
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
|
||||||
|
result = icu_to_python(output_buf, sz);
|
||||||
|
|
||||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
end:
|
||||||
if (ret == NULL) return PyErr_NoMemory();
|
if (input_buf != NULL) free(input_buf);
|
||||||
|
if (output_buf != NULL) free(output_buf);
|
||||||
|
return result;
|
||||||
|
|
||||||
free(buf2);
|
|
||||||
if (buf3 != NULL) free(buf3);
|
|
||||||
PyMem_Free(input);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
} // }}}
|
|
||||||
|
|
||||||
// title {{{
|
|
||||||
static PyObject *
|
|
||||||
icu_title(PyObject *self, PyObject *args) {
|
|
||||||
char *input, *ans, *buf3 = NULL;
|
|
||||||
const char *loc;
|
|
||||||
int32_t sz;
|
|
||||||
UChar *buf, *buf2;
|
|
||||||
PyObject *ret;
|
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
|
||||||
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;
|
|
||||||
|
|
||||||
sz = (int32_t)strlen(input);
|
|
||||||
|
|
||||||
buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
|
|
||||||
buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
|
|
||||||
|
|
||||||
|
|
||||||
if (buf == NULL || buf2 == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
|
|
||||||
u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status);
|
|
||||||
|
|
||||||
ans = input;
|
|
||||||
sz = u_strlen(buf2);
|
|
||||||
free(buf);
|
|
||||||
|
|
||||||
if (U_SUCCESS(status) && sz > 0) {
|
|
||||||
buf3 = (char*)calloc(sz*5+1, sizeof(char));
|
|
||||||
if (buf3 == NULL) return PyErr_NoMemory();
|
|
||||||
u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
|
|
||||||
if (U_SUCCESS(status)) ans = buf3;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");
|
|
||||||
if (ret == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
free(buf2);
|
|
||||||
if (buf3 != NULL) free(buf3);
|
|
||||||
PyMem_Free(input);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// set_default_encoding {{{
|
// set_default_encoding {{{
|
||||||
@ -662,7 +538,7 @@ icu_set_default_encoding(PyObject *self, PyObject *args) {
|
|||||||
}
|
}
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// set_default_encoding {{{
|
// set_filesystem_encoding {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
|
icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
|
||||||
char *encoding;
|
char *encoding;
|
||||||
@ -674,7 +550,7 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
|
|||||||
}
|
}
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// set_default_encoding {{{
|
// get_available_transliterators {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_get_available_transliterators(PyObject *self, PyObject *args) {
|
icu_get_available_transliterators(PyObject *self, PyObject *args) {
|
||||||
PyObject *ans, *l;
|
PyObject *ans, *l;
|
||||||
@ -835,16 +711,8 @@ icu_roundtrip(PyObject *self, PyObject *args) {
|
|||||||
|
|
||||||
// Module initialization {{{
|
// Module initialization {{{
|
||||||
static PyMethodDef icu_methods[] = {
|
static PyMethodDef icu_methods[] = {
|
||||||
{"upper", icu_upper, METH_VARARGS,
|
{"change_case", icu_change_case, METH_VARARGS,
|
||||||
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
|
"change_case(unicode object, which, locale) -> change case to one of UPPER_CASE, LOWER_CASE, TITLE_CASE"
|
||||||
},
|
|
||||||
|
|
||||||
{"lower", icu_lower, METH_VARARGS,
|
|
||||||
"lower(locale, unicode object) -> lower cased unicode object using locale rules."
|
|
||||||
},
|
|
||||||
|
|
||||||
{"title", icu_title, METH_VARARGS,
|
|
||||||
"title(locale, unicode object) -> Title cased unicode object using locale rules."
|
|
||||||
},
|
},
|
||||||
|
|
||||||
{"set_default_encoding", icu_set_default_encoding, METH_VARARGS,
|
{"set_default_encoding", icu_set_default_encoding, METH_VARARGS,
|
||||||
@ -946,5 +814,9 @@ initicu(void)
|
|||||||
ADDUCONST(UNORM_NFKC);
|
ADDUCONST(UNORM_NFKC);
|
||||||
ADDUCONST(UNORM_FCD);
|
ADDUCONST(UNORM_FCD);
|
||||||
|
|
||||||
|
ADDUCONST(UPPER_CASE);
|
||||||
|
ADDUCONST(LOWER_CASE);
|
||||||
|
ADDUCONST(TITLE_CASE);
|
||||||
|
|
||||||
}
|
}
|
||||||
// }}}
|
// }}}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
@ -7,232 +9,20 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
# Setup code {{{
|
# Setup code {{{
|
||||||
import sys
|
import sys
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
from calibre.utils.config_base import tweaks
|
from calibre.utils.config_base import tweaks
|
||||||
|
|
||||||
_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None
|
_locale = _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None
|
||||||
_locale = None
|
|
||||||
|
|
||||||
_none = u''
|
_none = u''
|
||||||
_none2 = b''
|
_none2 = b''
|
||||||
|
|
||||||
def get_locale():
|
|
||||||
global _locale
|
|
||||||
if _locale is None:
|
|
||||||
from calibre.utils.localization import get_lang
|
|
||||||
if tweaks['locale_for_sorting']:
|
|
||||||
_locale = tweaks['locale_for_sorting']
|
|
||||||
else:
|
|
||||||
_locale = get_lang()
|
|
||||||
return _locale
|
|
||||||
|
|
||||||
def load_icu():
|
|
||||||
global _icu
|
|
||||||
if _icu is None:
|
|
||||||
_icu = plugins['icu'][0]
|
|
||||||
if _icu is None:
|
|
||||||
print 'Loading ICU failed with: ', plugins['icu'][1]
|
|
||||||
else:
|
|
||||||
if not getattr(_icu, 'ok', False):
|
|
||||||
print 'icu not ok'
|
|
||||||
_icu = None
|
|
||||||
return _icu
|
|
||||||
|
|
||||||
def load_collator():
|
|
||||||
'The default collator for most locales takes both case and accented letters into account'
|
|
||||||
global _collator
|
|
||||||
if _collator is None:
|
|
||||||
icu = load_icu()
|
|
||||||
if icu is not None:
|
|
||||||
_collator = icu.Collator(get_locale())
|
|
||||||
return _collator
|
|
||||||
|
|
||||||
def primary_collator():
|
|
||||||
'Ignores case differences and accented characters'
|
|
||||||
global _primary_collator
|
|
||||||
if _primary_collator is None:
|
|
||||||
_primary_collator = _collator.clone()
|
|
||||||
_primary_collator.strength = _icu.UCOL_PRIMARY
|
|
||||||
return _primary_collator
|
|
||||||
|
|
||||||
def sort_collator():
|
|
||||||
'Ignores case differences and recognizes numbers in strings'
|
|
||||||
global _sort_collator
|
|
||||||
if _sort_collator is None:
|
|
||||||
_sort_collator = _collator.clone()
|
|
||||||
_sort_collator.strength = _icu.UCOL_SECONDARY
|
|
||||||
if tweaks['numeric_collation']:
|
|
||||||
try:
|
|
||||||
_sort_collator.numeric = True
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
return _sort_collator
|
|
||||||
|
|
||||||
def py_sort_key(obj):
    '''Pure python sort key: the lower-cased string, or the empty unicode string for falsy input.'''
    return obj.lower() if obj else _none
|
|
||||||
|
|
||||||
def icu_sort_key(collator, obj):
    '''Return the ICU sort key for obj, or an empty bytestring for falsy input.

    NOTE: the ``collator`` argument is not consulted; the module level sort
    collator is always used.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _sort_collator.sort_key(obj)
        except AttributeError:
            # _sort_collator has not been created yet (it is still None)
            return sort_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters stripped from the input
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        return _sort_collator.sort_key(obj.replace(nul, empty))
|
|
||||||
|
|
||||||
def numeric_collator():
    '''Return the cached secondary-strength collator with numeric collation enabled.

    The collator is created on first use, like the other collator factories
    in this module, instead of being re-cloned on every call.
    '''
    global _numeric_collator
    if _numeric_collator is None:
        _numeric_collator = _collator.clone()
        _numeric_collator.strength = _icu.UCOL_SECONDARY
        _numeric_collator.numeric = True
    return _numeric_collator
|
|
||||||
|
|
||||||
def numeric_sort_key(obj):
    '''Sort key that uses natural sorting for numbers inside strings, so something2 sorts before something10.

    Returns an empty bytestring for falsy input.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _numeric_collator.sort_key(obj)
        except AttributeError:
            # _numeric_collator has not been created yet (it is still None)
            return numeric_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters stripped from the input
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        return _numeric_collator.sort_key(obj.replace(nul, empty))
|
|
||||||
|
|
||||||
def icu_change_case(upper, locale, obj):
    '''Upper-case (``upper`` true) or lower-case obj with ICU, using the given locale.

    If ICU rejects the input with a TypeError, the conversion is retried once
    with NUL characters removed.
    '''
    convert = getattr(_icu, 'upper' if upper else 'lower')
    try:
        return convert(locale, obj)
    except TypeError:
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        return convert(locale, obj.replace(nul, empty))
|
|
||||||
|
|
||||||
def py_find(pattern, source):
    '''Pure python substring search.

    Returns (position, len(pattern)) for the first occurrence of pattern in
    source, or (-1, -1) when it does not occur.
    '''
    idx = source.find(pattern)
    return (idx, len(pattern)) if idx > -1 else (-1, -1)
|
|
||||||
|
|
||||||
def character_name(string):
    '''Return the Unicode name of the character in string, or None if it cannot be determined.

    ICU is used when available; otherwise the name of the first character is
    looked up with the unicodedata module.
    '''
    try:
        try:
            return _icu.character_name(unicode(string)) or None
        except AttributeError:
            import unicodedata
            return unicodedata.name(unicode(string)[0], None)
    except (TypeError, ValueError, KeyError, IndexError):
        # IndexError: the fallback indexes [0], which fails for an empty
        # string; treat that like every other failed lookup and return None
        pass
|
|
||||||
|
|
||||||
def character_name_from_code(code):
    '''Return the Unicode name for the code point ``code``, or '' if it cannot be determined.'''
    try:
        try:
            name = _icu.character_name_from_code(code)
        except AttributeError:
            # Fall back to the python standard library lookup
            import unicodedata
            name = unicodedata.name(py_safe_chr(code), '')
    except (TypeError, ValueError, KeyError):
        return ''
    return name or ''
|
|
||||||
|
|
||||||
if sys.maxunicode < 0x10ffff:
    def py_safe_chr(i):
        # Narrow builds of python cannot represent code points > 0xffff as a
        # single character, so we need our own implementation of unichr
        # that returns them as a surrogate pair
        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
else:
    # Wide build: the builtin can represent every code point directly
    try:
        py_safe_chr = unichr
    except NameError:
        py_safe_chr = chr
|
|
||||||
|
|
||||||
def safe_chr(code):
    '''Return the character for the code point ``code``, preferring ICU and
    falling back to py_safe_chr() when the ICU function is unavailable.'''
    try:
        result = _icu.chr(code)
    except AttributeError:
        result = py_safe_chr(code)
    return result
|
|
||||||
|
|
||||||
def normalize(text, mode='NFC'):
    '''Return the Unicode normalization of text in the given mode.

    Falls back to unicodedata.normalize() when ICU is unavailable or the mode
    is not present in the ICU mode table.
    '''
    # This is very slightly slower than using unicodedata.normalize, so stick with
    # that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    try:
        icu_normalize = _icu.normalize
        icu_mode = _nmodes[mode]
        return icu_normalize(icu_mode, unicode(text))
    except (AttributeError, KeyError):
        import unicodedata
        return unicodedata.normalize(mode, unicode(text))
|
|
||||||
|
|
||||||
def icu_find(collator, pattern, source):
    '''Search for pattern in source with collator.find(), retrying with both
    arguments coerced to unicode if the first attempt raises TypeError.'''
    try:
        result = collator.find(pattern, source)
    except TypeError:
        result = collator.find(unicode(pattern), unicode(source))
    return result
|
|
||||||
|
|
||||||
def icu_startswith(collator, a, b):
    '''Return collator.startswith(a, b), retrying with both arguments coerced
    to unicode if the first attempt raises TypeError.'''
    try:
        result = collator.startswith(a, b)
    except TypeError:
        result = collator.startswith(unicode(a), unicode(b))
    return result
|
|
||||||
|
|
||||||
def py_case_sensitive_sort_key(obj):
    '''Pure python case sensitive sort key: obj itself, or the empty unicode string for falsy input.'''
    return obj if obj else _none
|
|
||||||
|
|
||||||
def icu_case_sensitive_sort_key(collator, obj):
    '''Return collator.sort_key(obj), or an empty bytestring for falsy input.'''
    return collator.sort_key(obj) if obj else _none2
|
|
||||||
|
|
||||||
def icu_strcmp(collator, a, b):
    '''Compare a and b with collator.strcmp() after lower-casing both.'''
    compare = collator.strcmp
    return compare(lower(a), lower(b))
|
|
||||||
|
|
||||||
def py_strcmp(a, b):
    '''Pure python comparison of a and b that ignores case (via str.lower).'''
    left = a.lower()
    right = b.lower()
    return cmp(left, right)
|
|
||||||
|
|
||||||
def icu_case_sensitive_strcmp(collator, a, b):
    '''Compare a and b with collator.strcmp(), without any case folding.'''
    result = collator.strcmp(a, b)
    return result
|
|
||||||
|
|
||||||
def icu_capitalize(s):
    '''Lower-case s, then upper-case its first character. Empty results are returned as is.'''
    lowered = lower(s)
    if not lowered:
        return lowered
    first = lowered[0]
    # count=1 so that only the leading occurrence is replaced
    return lowered.replace(first, upper(first), 1)
|
|
||||||
|
|
||||||
_cmap = {}
|
_cmap = {}
|
||||||
def icu_contractions(collator):
    '''Return the contractions of collator, cached per collator in _cmap.

    The result is a frozenset of the non-empty contractions, or an empty dict
    when the collator reports none.
    '''
    global _cmap
    cached = _cmap.get(collator, None)
    if cached is not None:
        return cached
    raw = collator.contractions()
    cached = frozenset(filter(None, raw)) if raw else {}
    _cmap[collator] = cached
    return cached
|
|
||||||
|
|
||||||
def icu_collation_order(collator, a):
|
_icu, err = plugins['icu']
|
||||||
try:
|
if _icu is None:
|
||||||
return collator.collation_order(a)
|
raise RuntimeError('Failed to load icu with error: %s' % err)
|
||||||
except TypeError:
|
del err
|
||||||
return collator.collation_order(unicode(a))
|
|
||||||
|
|
||||||
load_icu()
|
|
||||||
load_collator()
|
|
||||||
_icu_not_ok = _icu is None or _collator is None
|
|
||||||
icu_unicode_version = getattr(_icu, 'unicode_version', None)
|
icu_unicode_version = getattr(_icu, 'unicode_version', None)
|
||||||
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
|
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
|
||||||
|
|
||||||
@ -252,290 +42,208 @@ try:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def collator():
    '''Return the default collator, creating it (and resolving the locale) on first use.

    The locale comes from the ``locale_for_sorting`` tweak when set, otherwise
    from get_lang(). If a collator cannot be created for that locale, a
    message is printed and the English collator is used instead.
    '''
    global _collator, _locale
    if _collator is not None:
        return _collator
    if _locale is None:
        from calibre.utils.localization import get_lang
        forced = tweaks['locale_for_sorting']
        _locale = forced if forced else get_lang()
    try:
        _collator = _icu.Collator(_locale)
    except Exception as e:
        print ('Failed to load collator for locale: %r with error %r, using English' % (_locale, e))
        _collator = _icu.Collator('en')
    return _collator
|
||||||
|
|
||||||
|
def change_locale(locale=None):
    '''Set the collation locale and discard every cached collator so that they
    are rebuilt for the new locale on next use.'''
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator
    _collator = None
    _primary_collator = None
    _sort_collator = None
    _numeric_collator = None
    _case_sensitive_collator = None
    _locale = locale
|
||||||
|
|
||||||
|
def primary_collator():
    '''Return the cached primary-strength collator (ignores case differences and accented characters).'''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = collator().clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
|
||||||
|
|
||||||
|
def sort_collator():
    '''Return the cached sorting collator: ignores case differences and
    recognizes numbers in strings (if the tweak is set).'''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = collator().clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    _sort_collator.numeric = tweaks['numeric_collation']
    return _sort_collator
|
||||||
|
|
||||||
|
def numeric_collator():
    '''Return the cached collator that uses natural sorting for numbers inside
    strings, so something2 will sort before something10.'''
    global _numeric_collator
    if _numeric_collator is not None:
        return _numeric_collator
    _numeric_collator = collator().clone()
    _numeric_collator.strength = _icu.UCOL_SECONDARY
    _numeric_collator.numeric = True
    return _numeric_collator
|
||||||
|
|
||||||
|
def case_sensitive_collator():
    '''Return the cached collator that always sorts upper case letters before lower case.'''
    global _case_sensitive_collator
    if _case_sensitive_collator is not None:
        return _case_sensitive_collator
    _case_sensitive_collator = collator().clone()
    # Mirror the numeric setting of the regular sort collator
    _case_sensitive_collator.numeric = sort_collator().numeric
    _case_sensitive_collator.upper_first = True
    return _case_sensitive_collator
|
||||||
|
|
||||||
|
# Templates that will be used to generate various concrete
|
||||||
|
# function implementations based on different collators, to allow lazy loading
|
||||||
|
# of collators, with maximum runtime performance
|
||||||
|
|
||||||
|
_sort_key_template = '''
|
||||||
|
def {name}(obj):
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
return {collator}.{func}(obj)
|
||||||
|
except AttributeError:
|
||||||
|
return {collator_func}().{func}(obj)
|
||||||
|
except TypeError:
|
||||||
|
if isinstance(obj, bytes):
|
||||||
|
try:
|
||||||
|
obj = obj.decode(sys.getdefaultencoding())
|
||||||
|
except ValueError:
|
||||||
|
return obj
|
||||||
|
return {collator}.{func}(obj)
|
||||||
|
return b''
|
||||||
|
'''
|
||||||
|
|
||||||
|
_strcmp_template = '''
|
||||||
|
def {name}(a, b):
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
return {collator}.{func}(a, b)
|
||||||
|
except AttributeError:
|
||||||
|
return {collator_func}().{func}(a, b)
|
||||||
|
except TypeError:
|
||||||
|
if isinstance(a, bytes):
|
||||||
|
try:
|
||||||
|
a = a.decode(sys.getdefaultencoding())
|
||||||
|
except ValueError:
|
||||||
|
return cmp(a, b)
|
||||||
|
elif a is None:
|
||||||
|
a = u''
|
||||||
|
if isinstance(b, bytes):
|
||||||
|
try:
|
||||||
|
b = b.decode(sys.getdefaultencoding())
|
||||||
|
except ValueError:
|
||||||
|
return cmp(a, b)
|
||||||
|
elif b is None:
|
||||||
|
b = u''
|
||||||
|
return {collator}.{func}(a, b)
|
||||||
|
'''
|
||||||
|
|
||||||
|
_change_case_template = '''
|
||||||
|
def {name}(x):
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
return _icu.change_case(x, _icu.{which}, _locale)
|
||||||
|
except NotImplementedError:
|
||||||
|
collator() # sets _locale
|
||||||
|
return _icu.change_case(x, _icu.{which}, _locale)
|
||||||
|
except TypeError:
|
||||||
|
if isinstance(x, bytes):
|
||||||
|
try:
|
||||||
|
x = x.decode(sys.getdefaultencoding())
|
||||||
|
except ValueError:
|
||||||
|
return x
|
||||||
|
return _icu.change_case(x, _icu.{which}, _locale)
|
||||||
|
raise
|
||||||
|
'''
|
||||||
|
|
||||||
|
def _make_func(template, name, **kwargs):
    '''Generate a function from a source template and return it.

    The template is formatted with kwargs (plus ``name``, and ``func``
    defaulting to 'sort_key'), executed in this module's global namespace, and
    the resulting global called ``name`` is returned.
    '''
    namespace = globals()
    kwargs['name'] = name
    kwargs.setdefault('func', 'sort_key')
    source = template.format(**kwargs)
    # Tuple form of exec: equivalent to ``exec source in namespace``
    exec(source, namespace)
    return namespace[name]
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
################# The string functions ########################################
|
################# The string functions ########################################
|
||||||
|
sort_key = _make_func(_sort_key_template, 'sort_key', collator='_sort_collator', collator_func='sort_collator')
|
||||||
|
|
||||||
sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
|
numeric_sort_key = _make_func(_sort_key_template, 'numeric_sort_key', collator='_numeric_collator', collator_func='numeric_collator')
|
||||||
|
|
||||||
strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
|
primary_sort_key = _make_func(_sort_key_template, 'primary_sort_key', collator='_primary_collator', collator_func='primary_collator')
|
||||||
|
|
||||||
case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
|
case_sensitive_sort_key = _make_func(_sort_key_template, 'case_sensitive_sort_key',
|
||||||
partial(icu_case_sensitive_sort_key, _collator)
|
collator='_case_sensitive_collator', collator_func='case_sensitive_collator')
|
||||||
|
|
||||||
case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
|
collation_order = _make_func(_sort_key_template, 'collation_order', collator='_sort_collator', collator_func='sort_collator', func='collation_order')
|
||||||
|
|
||||||
upper = (lambda s: s.upper()) if _icu_not_ok else \
|
strcmp = _make_func(_strcmp_template, 'strcmp', collator='_sort_collator', collator_func='sort_collator', func='strcmp')
|
||||||
partial(icu_change_case, True, get_locale())
|
|
||||||
|
|
||||||
lower = (lambda s: s.lower()) if _icu_not_ok else \
|
case_sensitive_strcmp = _make_func(
|
||||||
partial(icu_change_case, False, get_locale())
|
_strcmp_template, 'case_sensitive_strcmp', collator='_case_sensitive_collator', collator_func='case_sensitive_collator', func='strcmp')
|
||||||
|
|
||||||
title_case = (lambda s: s.title()) if _icu_not_ok else \
|
primary_strcmp = _make_func(_strcmp_template, 'primary_strcmp', collator='_primary_collator', collator_func='primary_collator', func='strcmp')
|
||||||
partial(_icu.title, get_locale())
|
|
||||||
|
|
||||||
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
|
upper = _make_func(_change_case_template, 'upper', which='UPPER_CASE')
|
||||||
(lambda s: icu_capitalize(s))
|
|
||||||
|
|
||||||
find = (py_find if _icu_not_ok else partial(icu_find, _collator))
|
lower = _make_func(_change_case_template, 'lower', which='LOWER_CASE')
|
||||||
|
|
||||||
contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
|
title_case = _make_func(_change_case_template, 'title_case', which='TITLE_CASE')
|
||||||
_collator)))
|
|
||||||
|
|
||||||
def primary_strcmp(a, b):
|
def capitalize(x):
    '''Return x with its first character upper-cased and the rest lower-cased.

    Falsy input (such as the empty string) is returned unchanged instead of
    raising IndexError on ``x[0]``.
    '''
    if not x:
        return x
    return upper(x[0]) + lower(x[1:])
|
||||||
'strcmp that ignores case and accents on letters'
|
|
||||||
if _icu_not_ok:
|
find = _make_func(_strcmp_template, 'find', collator='_collator', collator_func='collator', func='find')
|
||||||
from calibre.utils.filenames import ascii_text
|
|
||||||
return py_strcmp(ascii_text(a), ascii_text(b))
|
primary_find = _make_func(_strcmp_template, 'primary_find', collator='_primary_collator', collator_func='primary_collator', func='find')
|
||||||
|
|
||||||
|
startswith = _make_func(_strcmp_template, 'startswith', collator='_collator', collator_func='collator', func='startswith')
|
||||||
|
|
||||||
|
primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator='_primary_collator', collator_func='primary_collator', func='startswith')
|
||||||
|
|
||||||
|
safe_chr = _icu.chr
|
||||||
|
|
||||||
|
def character_name(string):
|
||||||
try:
|
try:
|
||||||
return _primary_collator.strcmp(a, b)
|
return _icu.character_name(unicode(string)) or None
|
||||||
except AttributeError:
|
except (TypeError, ValueError, KeyError):
|
||||||
return primary_collator().strcmp(a, b)
|
pass
|
||||||
|
|
||||||
def primary_find(pat, src):
|
def character_name_from_code(code):
|
||||||
'find that ignores case and accents on letters'
|
|
||||||
if _icu_not_ok:
|
|
||||||
from calibre.utils.filenames import ascii_text
|
|
||||||
return py_find(ascii_text(pat), ascii_text(src))
|
|
||||||
return primary_icu_find(pat, src)
|
|
||||||
|
|
||||||
def primary_icu_find(pat, src):
|
|
||||||
try:
|
try:
|
||||||
return icu_find(_primary_collator, pat, src)
|
return _icu.character_name_from_code(code) or ''
|
||||||
except AttributeError:
|
except (TypeError, ValueError, KeyError):
|
||||||
return icu_find(primary_collator(), pat, src)
|
return ''
|
||||||
|
|
||||||
def primary_sort_key(val):
|
def normalize(text, mode='NFC'):
|
||||||
'A sort key that ignores case and diacritics'
|
# This is very slightly slower than using unicodedata.normalize, so stick with
|
||||||
if _icu_not_ok:
|
# that unless you have very good reasons not to. Also, its speed
|
||||||
from calibre.utils.filenames import ascii_text
|
# decreases on wide python builds, where conversion to/from ICU's string
|
||||||
return ascii_text(val).lower()
|
# representation is slower.
|
||||||
try:
|
return _icu.normalize(_nmodes[mode], unicode(text))
|
||||||
return _primary_collator.sort_key(val)
|
|
||||||
except AttributeError:
|
|
||||||
return primary_collator().sort_key(val)
|
|
||||||
|
|
||||||
def primary_startswith(a, b):
|
def contractions(col=None):
|
||||||
if _icu_not_ok:
|
global _cmap
|
||||||
from calibre.utils.filenames import ascii_text
|
col = col or _collator
|
||||||
return ascii_text(a).lower().startswith(ascii_text(b).lower())
|
if col is None:
|
||||||
try:
|
col = collator()
|
||||||
return icu_startswith(_primary_collator, a, b)
|
# Look the cache up by the collator instance (col). Keying on the collator()
# factory function can never match an entry, since entries are stored under col
ans = _cmap.get(col, None)
|
||||||
except AttributeError:
|
if ans is None:
|
||||||
return icu_startswith(primary_collator(), a, b)
|
ans = col.contractions()
|
||||||
|
ans = frozenset(filter(None, ans))
|
||||||
|
_cmap[col] = ans
|
||||||
|
return ans
|
||||||
|
|
||||||
def collation_order(a):
|
|
||||||
if _icu_not_ok:
|
|
||||||
return (ord(a[0]), 1) if a else (0, 0)
|
|
||||||
try:
|
|
||||||
return icu_collation_order(_sort_collator, a)
|
|
||||||
except AttributeError:
|
|
||||||
return icu_collation_order(sort_collator(), a)
|
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
def test(): # {{{
|
|
||||||
from calibre import prints
|
|
||||||
# Data {{{
|
|
||||||
german = '''
|
|
||||||
Sonntag
|
|
||||||
Montag
|
|
||||||
Dienstag
|
|
||||||
Januar
|
|
||||||
Februar
|
|
||||||
März
|
|
||||||
Fuße
|
|
||||||
Fluße
|
|
||||||
Flusse
|
|
||||||
flusse
|
|
||||||
fluße
|
|
||||||
flüße
|
|
||||||
flüsse
|
|
||||||
'''
|
|
||||||
german_good = '''
|
|
||||||
Dienstag
|
|
||||||
Februar
|
|
||||||
flusse
|
|
||||||
Flusse
|
|
||||||
fluße
|
|
||||||
Fluße
|
|
||||||
flüsse
|
|
||||||
flüße
|
|
||||||
Fuße
|
|
||||||
Januar
|
|
||||||
März
|
|
||||||
Montag
|
|
||||||
Sonntag'''
|
|
||||||
french = '''
|
|
||||||
dimanche
|
|
||||||
lundi
|
|
||||||
mardi
|
|
||||||
janvier
|
|
||||||
février
|
|
||||||
mars
|
|
||||||
déjà
|
|
||||||
Meme
|
|
||||||
deja
|
|
||||||
même
|
|
||||||
dejà
|
|
||||||
bpef
|
|
||||||
bœg
|
|
||||||
Boef
|
|
||||||
Mémé
|
|
||||||
bœf
|
|
||||||
boef
|
|
||||||
bnef
|
|
||||||
pêche
|
|
||||||
pèché
|
|
||||||
pêché
|
|
||||||
pêche
|
|
||||||
pêché'''
|
|
||||||
french_good = '''
|
|
||||||
bnef
|
|
||||||
boef
|
|
||||||
Boef
|
|
||||||
bœf
|
|
||||||
bœg
|
|
||||||
bpef
|
|
||||||
deja
|
|
||||||
dejà
|
|
||||||
déjà
|
|
||||||
dimanche
|
|
||||||
février
|
|
||||||
janvier
|
|
||||||
lundi
|
|
||||||
mardi
|
|
||||||
mars
|
|
||||||
Meme
|
|
||||||
Mémé
|
|
||||||
même
|
|
||||||
pèché
|
|
||||||
pêche
|
|
||||||
pêche
|
|
||||||
pêché
|
|
||||||
pêché'''
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
def create(l):
|
|
||||||
l = l.decode('utf-8').splitlines()
|
|
||||||
return [x.strip() for x in l if x.strip()]
|
|
||||||
|
|
||||||
def test_strcmp(entries):
|
|
||||||
for x in entries:
|
|
||||||
for y in entries:
|
|
||||||
if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
|
|
||||||
print 'strcmp failed for %r, %r'%(x, y)
|
|
||||||
|
|
||||||
german = create(german)
|
|
||||||
c = _icu.Collator('de')
|
|
||||||
c.numeric = True
|
|
||||||
gs = list(sorted(german, key=c.sort_key))
|
|
||||||
if gs != create(german_good):
|
|
||||||
print 'German sorting failed'
|
|
||||||
return
|
|
||||||
print
|
|
||||||
french = create(french)
|
|
||||||
c = _icu.Collator('fr')
|
|
||||||
c.numeric = True
|
|
||||||
fs = list(sorted(french, key=c.sort_key))
|
|
||||||
if fs != create(french_good):
|
|
||||||
print 'French sorting failed (note that French fails with icu < 4.6)'
|
|
||||||
return
|
|
||||||
test_strcmp(german + french)
|
|
||||||
|
|
||||||
print '\nTesting case transforms in current locale'
|
|
||||||
from calibre.utils.titlecase import titlecase
|
|
||||||
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
|
|
||||||
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
|
|
||||||
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
|
|
||||||
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
|
|
||||||
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
|
|
||||||
print
|
|
||||||
|
|
||||||
print '\nTesting primary collation'
|
|
||||||
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
|
|
||||||
u'Štepánek':u'ŠtepaneK'}.iteritems():
|
|
||||||
if primary_strcmp(k, v) != 0:
|
|
||||||
prints('primary_strcmp() failed with %s != %s'%(k, v))
|
|
||||||
return
|
|
||||||
if primary_find(v, u' '+k)[0] != 1:
|
|
||||||
prints('primary_find() failed with %s not in %s'%(v, k))
|
|
||||||
return
|
|
||||||
|
|
||||||
n = character_name(safe_chr(0x1f431))
|
|
||||||
if n != u'CAT FACE':
|
|
||||||
raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
|
|
||||||
|
|
||||||
global _primary_collator
|
|
||||||
orig = _primary_collator
|
|
||||||
_primary_collator = _icu.Collator('es')
|
|
||||||
if primary_strcmp(u'peña', u'pena') == 0:
|
|
||||||
print 'Primary collation in Spanish locale failed'
|
|
||||||
return
|
|
||||||
_primary_collator = orig
|
|
||||||
|
|
||||||
print '\nTesting contractions'
|
|
||||||
c = _icu.Collator('cs')
|
|
||||||
if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
|
|
||||||
u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
|
|
||||||
u'S\u030c', u'R\u030c']):
|
|
||||||
print 'Contractions for the Czech language failed'
|
|
||||||
return
|
|
||||||
|
|
||||||
print '\nTesting startswith'
|
|
||||||
p = primary_startswith
|
|
||||||
if (not p('asd', 'asd') or not p('asd', 'A') or
|
|
||||||
not p('x', '')):
|
|
||||||
print 'startswith() failed'
|
|
||||||
return
|
|
||||||
|
|
||||||
print '\nTesting collation_order()'
|
|
||||||
for group in [
|
|
||||||
('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
|
|
||||||
('calibre', 'Charon', 'Collins'),
|
|
||||||
('01', '1'),
|
|
||||||
('1', '11', '13'),
|
|
||||||
]:
|
|
||||||
last = None
|
|
||||||
for x in group:
|
|
||||||
val = icu_collation_order(sort_collator(), x)
|
|
||||||
if val[1] != 1:
|
|
||||||
prints('collation_order() returned incorrect length for', x)
|
|
||||||
if last is None:
|
|
||||||
last = val
|
|
||||||
else:
|
|
||||||
if val != last:
|
|
||||||
prints('collation_order() returned incorrect value for', x)
|
|
||||||
last = val
|
|
||||||
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
def test_roundtrip():
    '''Check that strings survive _icu.roundtrip() unchanged; raises ValueError otherwise.'''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for text in samples:
        echoed = _icu.roundtrip(text)
        if echoed != text:
            raise ValueError(u'Roundtripping failed: %r != %r' % (text, echoed))
|
|
||||||
|
|
||||||
def test_normalize_performance():
    '''Time normalize() against unicodedata.normalize() on the contents of t.txt.

    Does nothing when t.txt does not exist in the current directory.
    '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Close the file deterministically instead of leaking the handle
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time, unicodedata
    st = time.time()
    count = 100
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test_roundtrip()
|
from calibre.utils.icu_test import run
|
||||||
test_normalize_performance()
|
run(verbosity=4)
|
||||||
test()
|
|
||||||
|
|
||||||
|
148
src/calibre/utils/icu_test.py
Normal file
148
src/calibre/utils/icu_test.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import unittest, sys
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
import calibre.utils.icu as icu
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def make_collation_func(name, locale, numeric=True, template='_sort_key_template', func='strcmp'):
    '''Context manager yielding a function generated from one of the icu module's templates.

    A collator for ``locale`` is installed as a temporary attribute of the icu
    module so that the generated function can refer to it by name. The
    attribute is removed again on exit, even if the with-body raises (for
    example on a failed assertion).
    '''
    c = icu._icu.Collator(locale)
    cname = '%s_test_collator%s' % (name, template)
    setattr(icu, cname, c)
    try:
        c.numeric = numeric
        yield icu._make_func(getattr(icu, template), name, collator=cname, collator_func='not_used_xxx', func=func)
    finally:
        delattr(icu, cname)
|
||||||
|
|
||||||
|
class TestICU(unittest.TestCase):
    ' Tests for the public API of the calibre.utils.icu module '

    ae = unittest.TestCase.assertEqual

    def setUp(self):
        # Every test starts from a known locale
        icu.change_locale('en')

    def test_sorting(self):
        ' Test the various sorting APIs '
        german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split()
        german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split()
        french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split()
        french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split()  # noqa

        # Test corner cases
        sort_key = icu.sort_key
        s = '\U0001f431'
        self.ae(sort_key(s), sort_key(s.encode(sys.getdefaultencoding())), 'UTF-8 encoded object not correctly decoded to generate sort key')
        # The bytestring must actually be passed through sort_key(); comparing
        # it with itself can never fail
        self.ae(s.encode('utf-16'), sort_key(s.encode('utf-16')), 'Undecodable bytestring not returned as itself')
        self.ae(b'', sort_key(None))
        self.ae(0, icu.strcmp(None, b''))
        self.ae(0, icu.strcmp(s, s.encode(sys.getdefaultencoding())))

        # Test locales
        with make_collation_func('dsk', 'de', func='sort_key') as dsk:
            self.ae(german_good, sorted(german, key=dsk))
            with make_collation_func('dcmp', 'de', template='_strcmp_template') as dcmp:
                for x in german:
                    for y in german:
                        self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y))

        with make_collation_func('fsk', 'fr', func='sort_key') as fsk:
            self.ae(french_good, sorted(french, key=fsk))
            with make_collation_func('fcmp', 'fr', template='_strcmp_template') as fcmp:
                for x in french:
                    for y in french:
                        self.ae(cmp(fsk(x), fsk(y)), fcmp(x, y))

        with make_collation_func('ssk', 'es', func='sort_key') as ssk:
            self.assertNotEqual(ssk('peña'), ssk('pena'))
            with make_collation_func('scmp', 'es', template='_strcmp_template') as scmp:
                self.assertNotEqual(0, scmp('pena', 'peña'))

        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae(0, icu.primary_strcmp(k, v))

        # Test different types of collation
        self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa'))
        self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11'))
        self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a'))
        self.ae(0, icu.strcmp('a', 'A'))
        self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A'))
        self.ae(0, icu.primary_strcmp('ä', 'A'))

    def test_change_case(self):
        ' Test the various ways of changing the case '
        from calibre.utils.titlecase import titlecase
        # Test corner cases
        self.ae('A', icu.upper(b'a'))

        for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'):
            self.ae(icu.upper(x), x.upper())
            self.ae(icu.lower(x), x.lower())
            # ICU's title case algorithm is different from ours, when there are
            # capitals inside words
            self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine'))
            self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower())

    def test_find(self):
        ' Test searching for substrings '
        self.ae((1, 1), icu.find(b'a', b'1ab'))
        self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
        self.assertTrue(icu.startswith(b'abc', b'ab'))
        self.assertTrue(icu.startswith('abc', 'abc'))
        self.assertFalse(icu.startswith('xyz', 'a'))
        self.assertTrue(icu.startswith('xxx', ''))
        self.assertTrue(icu.primary_startswith('pena', 'peña'))

    def test_collation_order(self):
        'Testing collation ordering'
        for group in [
            ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
            ('01', '1'),
            ('1', '11', '13'),
        ]:
            last = None
            for x in group:
                order, length = icu.numeric_collator().collation_order(x)
                # All members of a group must share the same collation order
                if last is not None:
                    self.ae(last, order)
                last = order

    def test_roundtrip(self):
        ' Test that strings survive conversion to and from the ICU representation '
        for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
            self.ae(r, icu._icu.roundtrip(r))

    def test_character_name(self):
        ' Test looking up the name of a non-BMP character '
        self.ae(icu.character_name('\U0001f431'), 'CAT FACE')

    def test_contractions(self):
        ' Test the contractions reported for the Czech collator '
        c = icu._icu.Collator('cs')
        self.ae(icu.contractions(c), frozenset({u'Z\u030c', u'z\u030c', u'Ch',
            u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
            u'S\u030c', u'R\u030c'}))
|
||||||
|
|
||||||
|
class TestRunner(unittest.main):
    ' unittest.main variant that always runs exactly the TestICU test case '

    def createTests(self):
        self.test = unittest.TestLoader().loadTestsFromTestCase(TestICU)
|
||||||
|
|
||||||
|
def run(verbosity=4):
    ' Run the ICU tests with the given verbosity, without exiting the interpreter '
    TestRunner(exit=False, verbosity=verbosity)
|
||||||
|
|
||||||
|
def test_build():
    ' Run the ICU tests quietly, raising SystemExit(1) if any of them fails '
    runner = TestRunner(
        verbosity=0, buffer=True, catchbreak=True, failfast=True,
        argv=sys.argv[:1], exit=False)
    if runner.result.wasSuccessful():
        return
    raise SystemExit(1)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
run(verbosity=4)
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user