From a6ad9f2c969c542b1e5b14a0d1fe8a378649045a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 4 Dec 2010 10:30:04 -0700
Subject: [PATCH] Add an ICU implementation of strcmp

---
 src/calibre/utils/icu.c  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 src/calibre/utils/icu.py | 45 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index f981abe62e..79a888f272 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -133,11 +133,57 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
     return ans;
 }
 
+// Collator.strcmp {{{
+// Compare two unicode strings using this collator's rules. Both arguments
+// are received as UTF-8 ("es" format) and converted to UTF-16 for ICU.
+// Returns the UCollationResult of ucol_strcoll() as a Python int, or
+// NULL with an exception set on a parse or allocation failure.
+static PyObject *
+icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
+    char *a_, *b_;
+    size_t asz, bsz;
+    UChar *a, *b;
+    UErrorCode status = U_ZERO_ERROR;
+    UCollationResult res = UCOL_EQUAL;
+
+    if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL;
+
+    asz = strlen(a_); bsz = strlen(b_);
+
+    a = (UChar*)calloc(asz*4 + 1, sizeof(UChar));
+    b = (UChar*)calloc(bsz*4 + 1, sizeof(UChar));
+
+    if (a == NULL || b == NULL) {
+        // Release whatever was acquired; free(NULL) is a no-op. The "es"
+        // buffers are owned by us and must go back via PyMem_Free().
+        free(a); free(b);
+        PyMem_Free(a_); PyMem_Free(b_);
+        return PyErr_NoMemory();
+    }
+
+    u_strFromUTF8(a, asz*4 + 1, NULL, a_, asz, &status);
+    u_strFromUTF8(b, bsz*4 + 1, NULL, b_, bsz, &status);
+    PyMem_Free(a_); PyMem_Free(b_);
+
+    // If a conversion failed, res keeps its UCOL_EQUAL initial value.
+    if (U_SUCCESS(status))
+        res = ucol_strcoll(self->collator, a, -1, b, -1);
+
+    free(a); free(b);
+
+    return Py_BuildValue("i", res);
+}
+
+
 static PyMethodDef icu_Collator_methods[] = {
     {"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
      "sort_key(unicode object) -> Return a sort key for the given object as a bytestring. The idea is that these bytestring will sort using the builtin cmp function, just like the original unicode strings would sort in the current locale with ICU."
     },
 
+    {"strcmp", (PyCFunction)icu_Collator_strcmp, METH_VARARGS,
+     "strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sort_key(a), sort_key(b)), but faster."
+    },
+
     {NULL}  /* Sentinel */
 };
 
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 7c2fd31f78..5251380973 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -46,10 +46,43 @@ def icu_sort_key(collator, obj):
         return _none2
     return collator.sort_key(obj.lower())
 
+def py_case_sensitive_sort_key(obj):
+    # Fallback used without ICU: the object itself is the key
+    if not obj:
+        return _none
+    return obj
+
+def icu_case_sensitive_sort_key(collator, obj):
+    # Like icu_sort_key(), but without lower-casing obj first
+    if not obj:
+        return _none2
+    return collator.sort_key(obj)
+
+def icu_strcmp(collator, a, b):
+    # Case insensitive comparison: both strings are lower-cased first
+    return collator.strcmp(a.lower(), b.lower())
+
+def py_strcmp(a, b):
+    # Fallback used without ICU
+    return cmp(a.lower(), b.lower())
+
+def icu_case_sensitive_strcmp(collator, a, b):
+    # Compare a and b as-is (no lower-casing)
+    return collator.strcmp(a, b)
+
+
 load_icu()
 load_collator()
-sort_key = py_sort_key if _icu is None or _collator is None else \
-        partial(icu_sort_key, _collator)
+_icu_not_ok = _icu is None or _collator is None
+
+# Every icu_* implementation takes the collator as its first argument, so it
+# must be bound with partial() before being exported
+sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
+strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
+case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
+        partial(icu_case_sensitive_sort_key, _collator)
+case_sensitive_strcmp = cmp if _icu_not_ok else \
+        partial(icu_case_sensitive_strcmp, _collator)
 
 
 def test(): # {{{
@@ -137,6 +170,13 @@ pêché'''
         l = l.decode('utf-8').splitlines()
         return [x.strip() for x in l if x.strip()]
 
+    def test_strcmp(entries):
+        # strcmp() must agree with comparing the sort keys of its arguments
+        for x in entries:
+            for y in entries:
+                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
+                    print 'strcmp failed for %r, %r'%(x, y)
+
     german = create(german)
     c = _icu.Collator('de')
     print 'Sorted german:: (%s)'%c.actual_locale
@@ -156,5 +196,6 @@ pêché'''
     if fs != create(french_good):
         print 'French failed (note that French fails with icu < 4.6 i.e. on windows and OS X)'
         return
+    test_strcmp(german + french)
 
 # }}}