Add an ICU implementation of strcmp

2025-07-09 03:04:10 -04:00 · 2010-12-04 10:30:04 -07:00 · 2010-12-04 10:30:04 -07:00 · a6ad9f2c96
commit a6ad9f2c96
parent c1c62e5fd3
2 changed files with 70 additions and 2 deletions
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -133,11 +133,47 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    return ans;
 }

+// Collator.strcmp {{{
+/*
+ * Collator.strcmp(a, b) -> int
+ *
+ * Compare two unicode strings using this collator's rules.
+ *
+ * Both arguments are encoded to NUL-terminated UTF-8 by PyArg_ParseTuple
+ * ("es"), converted to UTF-16 with u_strFromUTF8 and compared with
+ * ucol_strcoll. The UCollationResult is returned as a Python int
+ * (UCOL_LESS / UCOL_EQUAL / UCOL_GREATER).
+ *
+ * If the UTF-8 -> UTF-16 conversion fails, the comparison is skipped and
+ * UCOL_EQUAL is returned (best effort, no exception is raised).
+ *
+ * Returns NULL with a Python exception set on argument parsing failure,
+ * on allocation failure, or when a string is too long for ICU's int32_t
+ * length parameters.
+ */
+static PyObject *
+icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
+    char *a_ = NULL, *b_ = NULL;
+    size_t asz, bsz;
+    UChar *a = NULL, *b = NULL;
+    PyObject *ans = NULL;
+    UErrorCode status = U_ZERO_ERROR;
+    UCollationResult res = UCOL_EQUAL;
+
+    // "es" allocates a_ and b_; from here on every exit path must release
+    // them with PyMem_Free.
+    if (!PyArg_ParseTuple(args, "eses", "UTF-8", &a_, "UTF-8", &b_)) return NULL;
+
+    asz = strlen(a_); bsz = strlen(b_);
+
+    // u_strFromUTF8 takes int32_t lengths and capacities, so refuse input
+    // that would be silently truncated by the size_t -> int32_t narrowing.
+    if (asz >= (size_t)INT32_MAX || bsz >= (size_t)INT32_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "string too long for ICU collation");
+        goto end;
+    }
+
+    // A UTF-16 string never has more code units than its UTF-8 form has
+    // bytes, so byte length + 1 (for the terminating NUL) is always enough.
+    a = (UChar*)calloc(asz + 1, sizeof(UChar));
+    b = (UChar*)calloc(bsz + 1, sizeof(UChar));
+
+    if (a == NULL || b == NULL) { PyErr_NoMemory(); goto end; }
+
+    // If the first conversion fails, the second call is a no-op because ICU
+    // functions return immediately when handed a failure status.
+    u_strFromUTF8(a, (int32_t)(asz + 1), NULL, a_, (int32_t)asz, &status);
+    u_strFromUTF8(b, (int32_t)(bsz + 1), NULL, b_, (int32_t)bsz, &status);
+
+    if (U_SUCCESS(status))
+        res = ucol_strcoll(self->collator, a, -1, b, -1);
+
+    ans = Py_BuildValue("i", res);
+
+end:
+    // free(NULL) and PyMem_Free(NULL) are no-ops, so this is safe no matter
+    // how far the function got.
+    free(a); free(b);
+    PyMem_Free(a_); PyMem_Free(b_);
+
+    return ans;
+} // }}}
+
+
 static PyMethodDef icu_Collator_methods[] = {
    {"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
     "sort_key(unicode object) -> Return a sort key for the given object as a bytestring. The idea is that these bytestring will sort using the builtin cmp function, just like the original unicode strings would sort in the current locale with ICU."
    },

+    /* Direct comparison of two strings, implemented by icu_Collator_strcmp. */
+    {"strcmp", (PyCFunction)icu_Collator_strcmp, METH_VARARGS,
+     "strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sort_key(a), sort_key(b)), but faster."
+    },
+
    {NULL}  /* Sentinel */
 };

--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -46,10 +46,35 @@ def icu_sort_key(collator, obj):
        return _none2
    return collator.sort_key(obj.lower())

+def py_case_sensitive_sort_key(obj):
+    if not obj:
+        return _none
+    return obj
+
+def icu_case_sensitive_sort_key(collator, obj):
+    if not obj:
+        return _none2
+    return collator.sort_key(obj)
+
+def icu_strcmp(collator, a, b):
+    return collator.strcmp(a.lower(), b.lower())
+
+def py_strcmp(a, b):
+    return cmp(a.lower(), b.lower())
+
+def icu_case_sensitive_strcmp(collator, a, b):
+    return collator.strcmp(a, b)
+
+
 load_icu()
 load_collator()
-sort_key = py_sort_key if _icu is None or _collator is None else \
-        partial(icu_sort_key, _collator)
+_icu_not_ok = _icu is None or _collator is None
+
+sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
+strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
+case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
+        icu_case_sensitive_sort_key
+case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp


 def test(): # {{{
@ -137,6 +162,12 @@ pêché'''
        l = l.decode('utf-8').splitlines()
        return [x.strip() for x in l if x.strip()]

+    def test_strcmp(entries):
+        for x in entries:
+            for y in entries:
+                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
+                    print 'strcmp failed for %r, %r'%(x, y)
+
    german = create(german)
    c = _icu.Collator('de')
    print 'Sorted german:: (%s)'%c.actual_locale
@ -156,5 +187,6 @@ pêché'''
    if fs != create(french_good):
        print 'French failed (note that French fails with icu < 4.6 i.e. on windows and OS X)'
        return
+    test_strcmp(german + french)
 # }}}