Use a cloned collator for primary collation

This commit is contained in:
Kovid Goyal 2012-07-04 22:10:56 +05:30
parent f8dfd7fdda
commit bb606bc3ab
2 changed files with 67 additions and 33 deletions

View File

@ -32,18 +32,18 @@ icu_Collator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
icu_Collator *self;
const char *loc;
UErrorCode status = U_ZERO_ERROR;
UCollator *collator;
if (!PyArg_ParseTuple(args, "s", &loc)) return NULL;
collator = ucol_open(loc, &status);
if (collator == NULL || U_FAILURE(status)) {
PyErr_SetString(PyExc_Exception, "Failed to create collator.");
return NULL;
}
self = (icu_Collator *)type->tp_alloc(type, 0);
if (self != NULL) {
self->collator = ucol_open(loc, &status);
if (self->collator == NULL || U_FAILURE(status)) {
PyErr_SetString(PyExc_Exception, "Failed to create collator.");
self->collator = NULL;
Py_DECREF(self);
return NULL;
}
self->collator = collator;
self->contractions = NULL;
}
@ -302,6 +302,10 @@ icu_Collator_span_contractions(icu_Collator *self, PyObject *args, PyObject *kwa
return Py_BuildValue("i", uset_span(self->contractions, s, slen, span_type));
} // }}}
/* Forward declaration: the icu_Collator_methods table below refers to
 * icu_Collator_clone, but the function itself is defined later in the file
 * because its body needs icu_CollatorType. */
static PyObject*
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs);
static PyMethodDef icu_Collator_methods[] = {
{"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
"sort_key(unicode object) -> Return a sort key for the given object as a bytestring. The idea is that these bytestring will sort using the builtin cmp function, just like the original unicode strings would sort in the current locale with ICU."
@ -323,6 +327,10 @@ static PyMethodDef icu_Collator_methods[] = {
"span_contractions(src, span_condition) -> returns the length of the initial substring according to span_condition in the set of contractions for this collator. Returns 0 if src does not fit the span_condition. The span_condition can be one of USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED, USET_SPAN_SIMPLE."
},
{"clone", (PyCFunction)icu_Collator_clone, METH_VARARGS,
"clone() -> returns a clone of this collator."
},
{NULL} /* Sentinel */
};
@ -390,6 +398,31 @@ static PyTypeObject icu_CollatorType = { // {{{
// }}
// Collator.clone {{{
/*
 * Collator.clone() -> a new Collator object wrapping a copy of this
 * object's ICU collator, produced by ucol_safeClone().
 *
 * The clone starts with contractions == NULL, exactly like a freshly
 * constructed Collator. `args` and `kwargs` are accepted but not inspected.
 *
 * Returns a new reference on success. On failure returns NULL with a Python
 * exception set: Exception("Failed to create collator.") when ICU could not
 * produce the clone, MemoryError when the wrapper object could not be
 * allocated. The cloned UCollator is closed on every failure path so it is
 * never leaked.
 */
static PyObject*
icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
{
    UCollator *collator;
    UErrorCode status = U_ZERO_ERROR;
    int32_t bufsize = -1;
    icu_Collator *clone;

    /* No stack buffer is supplied (NULL), so ICU allocates the clone itself. */
    collator = ucol_safeClone(self->collator, NULL, &bufsize, &status);
    if (collator == NULL || U_FAILURE(status)) {
        /* Do not leak a handle that was returned alongside a failure status. */
        if (collator != NULL) {
            ucol_close(collator);
        }
        PyErr_SetString(PyExc_Exception, "Failed to create collator.");
        return NULL;
    }

    clone = PyObject_New(icu_Collator, &icu_CollatorType);
    if (clone == NULL) {
        /* Nothing owns the ICU handle yet, so release it before bailing out. */
        ucol_close(collator);
        return PyErr_NoMemory();
    }

    /* The wrapper takes ownership of the ICU handle from here on. */
    clone->collator = collator;
    clone->contractions = NULL;

    return (PyObject*) clone;
} // }}}
// }}}

View File

@ -12,7 +12,7 @@ from functools import partial
from calibre.constants import plugins
from calibre.utils.config_base import tweaks
_icu = _collator = None
_icu = _collator = _primary_collator = None
_locale = None
_none = u''
@ -48,6 +48,12 @@ def load_collator():
_collator = icu.Collator(get_locale())
return _collator
def primary_collator():
    '''
    Return the module-wide primary-strength collator, creating it on first use.

    The collator is a clone of the module-level ``_collator`` with its
    ``strength`` set to ``_icu.UCOL_PRIMARY``; it is cached in
    ``_primary_collator`` so later calls return the same object.
    '''
    global _primary_collator
    if _primary_collator is not None:
        # Already built on an earlier call.
        return _primary_collator
    _primary_collator = _collator.clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
def py_sort_key(obj):
if not obj:
@ -65,18 +71,11 @@ def py_find(pattern, source):
return pos, len(pattern)
return -1, -1
def icu_find(collator, pattern, source, strength=None):
if strength is not None:
ostrength = collator.strength
collator.strength = strength
try:
def icu_find(collator, pattern, source):
try:
return collator.find(pattern, source)
except TypeError:
return collator.find(unicode(pattern), unicode(source))
finally:
if strength is not None:
collator.strength = ostrength
def py_case_sensitive_sort_key(obj):
if not obj:
@ -88,18 +87,8 @@ def icu_case_sensitive_sort_key(collator, obj):
return _none2
return collator.sort_key(obj)
def icu_strcmp(collator, a, b, strength=None):
if strength is not None:
ostrength = collator.strength
collator.strength = strength
try:
s = collator.strength
if s >= _icu.UCOL_TERTIARY:
a, b = lower(a), lower(b)
return collator.strcmp(a, b)
finally:
if strength is not None:
collator.strength = ostrength
def icu_strcmp(collator, a, b):
return collator.strcmp(lower(a), lower(b))
def py_strcmp(a, b, strength=None):
    '''
    Compare ``a`` and ``b`` after lower-casing both with their own
    ``lower()`` methods, returning the result of the builtin ``cmp``.

    ``strength`` is accepted but is not used by this implementation.
    '''
    lowered_a = a.lower()
    lowered_b = b.lower()
    return cmp(lowered_a, lowered_b)
@ -183,14 +172,14 @@ def primary_strcmp(a, b):
if _icu_not_ok:
from calibre.utils.filenames import ascii_text
return py_strcmp(ascii_text(a), ascii_text(b))
return icu_strcmp(_collator, a, b, _icu.UCOL_PRIMARY)
return primary_collator().strcmp(a, b)
def primary_find(pat, src):
'find that ignores case and accents on letters'
if _icu_not_ok:
from calibre.utils.filenames import ascii_text
return py_find(ascii_text(pat), ascii_text(src))
return icu_find(_collator, pat, src, _icu.UCOL_PRIMARY)
return icu_find(primary_collator(), pat, src)
################################################################################
@ -315,6 +304,18 @@ pêché'''
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
print
print '\nTesting primary collation'
for k, v in {u'pèché': u'peche', u'flüße':u'flusse'}.iteritems():
if primary_strcmp(k, v) != 0:
print 'primary_strcmp() failed with %s != %s'%(k, v)
if primary_find(v, u' '+k)[0] != 1:
print 'primary_find() failed with %s not in %s'%(v, k)
global _primary_collator
_primary_collator = _icu.Collator('es')
if primary_strcmp(u'peña', u'pena') == 0:
print 'Primary collation in Spanish locale failed'
# }}}
if __name__ == '__main__':