Add ICU case transform algorithms

2025-07-09 03:04:10 -04:00 · 2010-12-04 11:48:38 -07:00 · 2010-12-04 11:48:38 -07:00 · b3868a5ef4
commit b3868a5ef4
parent f424743046
2 changed files with 176 additions and 4 deletions
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -131,7 +131,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    if (ans == NULL) return PyErr_NoMemory();
    return ans;
-}
+} // }}}
 // Collator.strcmp {{{
 static PyObject *
@ -162,7 +162,8 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    free(a); free(b);
    return Py_BuildValue("i", res);
-}
+} // }}}
 static PyMethodDef icu_Collator_methods[] = {
@ -242,7 +243,156 @@ static PyTypeObject icu_CollatorType = { // {{{
 // Module initialization {{{
 // upper {{{
 // Upper case a string using the case mapping rules of an ICU locale.
 //
 // args is the tuple (locale, text): locale is read as a C string and text
 // is encoded to UTF-8 into a buffer that this function owns and releases
 // with PyMem_Free().
 //
 // Returns a new unicode object. If ICU reports an error, or the converted
 // text is empty, the input text is returned unchanged. On failure returns
 // NULL with a Python exception set.
 static PyObject *
 icu_upper(PyObject *self, PyObject *args) {
    char *input = NULL, *ans, *buf3 = NULL;
    const char *loc;
    size_t sz;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    sz = strlen(input);
    // calloc zero fills both buffers, so buf2 is a valid empty string even
    // when ICU bails out early and u_strlen() below is always safe.
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToUpper(buf2, sz*8, buf, -1, loc, &status);

    // Fall back to the unmodified input unless the whole pipeline succeeds.
    ans = input;
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    // On failure PyUnicode_DecodeUTF8 has already set an exception; keep it
    // and just return NULL after cleaning up.
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

 end:
    // Single exit point so every buffer is released on all paths.
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
 }
 // lower {{{
 // Lower case a string using the case mapping rules of an ICU locale.
 //
 // args is the tuple (locale, text): locale is read as a C string and text
 // is encoded to UTF-8 into a buffer that this function owns and releases
 // with PyMem_Free().
 //
 // Returns a new unicode object. If ICU reports an error, or the converted
 // text is empty, the input text is returned unchanged. On failure returns
 // NULL with a Python exception set.
 static PyObject *
 icu_lower(PyObject *self, PyObject *args) {
    char *input = NULL, *ans, *buf3 = NULL;
    const char *loc;
    size_t sz;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    sz = strlen(input);
    // calloc zero fills both buffers, so buf2 is a valid empty string even
    // when ICU bails out early and u_strlen() below is always safe.
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToLower(buf2, sz*8, buf, -1, loc, &status);

    // Fall back to the unmodified input unless the whole pipeline succeeds.
    ans = input;
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    // On failure PyUnicode_DecodeUTF8 has already set an exception; keep it
    // and just return NULL after cleaning up.
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

 end:
    // Single exit point so every buffer is released on all paths.
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
 }
 // title {{{
 // Title case a string using the case mapping rules of an ICU locale.
 //
 // args is the tuple (locale, text): locale is read as a C string and text
 // is encoded to UTF-8 into a buffer that this function owns and releases
 // with PyMem_Free(). No break iterator is supplied to u_strToTitle (NULL
 // is passed).
 //
 // Returns a new unicode object. If ICU reports an error, or the converted
 // text is empty, the input text is returned unchanged. On failure returns
 // NULL with a Python exception set.
 static PyObject *
 icu_title(PyObject *self, PyObject *args) {
    char *input = NULL, *ans, *buf3 = NULL;
    const char *loc;
    size_t sz;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    sz = strlen(input);
    // calloc zero fills both buffers, so buf2 is a valid empty string even
    // when ICU bails out early and u_strlen() below is always safe.
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status);

    // Fall back to the unmodified input unless the whole pipeline succeeds.
    ans = input;
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    // On failure PyUnicode_DecodeUTF8 has already set an exception; keep it
    // and just return NULL after cleaning up.
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

 end:
    // Single exit point so every buffer is released on all paths.
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
 }
 // Method table for the case transform functions: each entry maps a Python
 // visible name to its C implementation, called with positional arguments.
 static PyMethodDef icu_methods[] = {
    {"upper", icu_upper, METH_VARARGS, "upper(locale, unicode object) -> upper cased unicode object using locale rules."},
    {"lower", icu_lower, METH_VARARGS, "lower(locale, unicode object) -> lower cased unicode object using locale rules."},
    {"title", icu_title, METH_VARARGS, "title(locale, unicode object) -> Title cased unicode object using locale rules."},
    {NULL, NULL, 0, NULL}  /* Sentinel */
 };
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -10,10 +10,18 @@ from functools import partial
 from calibre.constants import plugins
 _icu = _collator = None
 _locale = None
 _none = u''
 _none2 = b''
 def get_locale():
    '''
    Return the value of get_lang(), looked up on the first call and cached
    in the module-level _locale for every later call.
    '''
    global _locale
    if _locale is not None:
        return _locale
    # Imported lazily, only when the cache has to be filled.
    from calibre.utils.localization import get_lang
    _locale = get_lang()
    return _locale
 def load_icu():
    global _icu
    if _icu is None:
@ -28,11 +36,10 @@ def load_icu():
 def load_collator():
    # Return the module-wide Collator, creating it on first use. The result
    # stays None when load_icu() returns None.
    global _collator
    # NOTE(review): once the collator is built from get_locale() (the '+'
    # line below), get_lang is no longer referenced in this function, so
    # this import looks unused - confirm and drop it.
    from calibre.utils.localization import get_lang
    if _collator is None:
        icu = load_icu()
        if icu is not None:
-            _collator = icu.Collator(get_lang())
+            _collator = icu.Collator(get_locale())
    return _collator
@ -76,6 +83,13 @@ case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
        icu_case_sensitive_sort_key
 # Bind the public comparison and case transform functions once, at import
 # time: plain Python fallbacks when _icu_not_ok is true, otherwise the ICU
 # implementations with the locale from get_locale() pre-applied.
 if _icu_not_ok:
    case_sensitive_strcmp = cmp
    upper = lambda s: s.upper()
    lower = lambda s: s.lower()
    title_case = lambda s: s.title()
 else:
    case_sensitive_strcmp = icu_case_sensitive_strcmp
    upper = partial(_icu.upper, get_locale())
    lower = partial(_icu.lower, get_locale())
    title_case = partial(_icu.title, get_locale())
 def test(): # {{{
    # Data {{{
@ -188,5 +202,13 @@ pêché'''
        print 'French failed (note that French fails with icu < 4.6 i.e. on windows and OS X)'
        return
    test_strcmp(german + french)
    print '\nTesting case transforms in current locale'
    for x in ('a', 'Alice\'s code'):
        print 'Upper:', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
        print 'Lower:', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
        print 'Title:', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8')
        print
 # }}}