Add ICU case transform algorithms

This commit is contained in:
Kovid Goyal 2010-12-04 11:48:38 -07:00
parent f424743046
commit b3868a5ef4
2 changed files with 176 additions and 4 deletions

View File

@ -131,7 +131,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *args, PyObject *kwargs) {
if (ans == NULL) return PyErr_NoMemory(); if (ans == NULL) return PyErr_NoMemory();
return ans; return ans;
} } // }}}
// Collator.strcmp {{{ // Collator.strcmp {{{
static PyObject * static PyObject *
@ -162,7 +162,8 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
free(a); free(b); free(a); free(b);
return Py_BuildValue("i", res); return Py_BuildValue("i", res);
} } // }}}
static PyMethodDef icu_Collator_methods[] = { static PyMethodDef icu_Collator_methods[] = {
@ -242,7 +243,156 @@ static PyTypeObject icu_CollatorType = { // {{{
// Module initialization {{{ // Module initialization {{{
// upper {{{
/*
 * upper(locale, text) -> unicode
 *
 * Upper-case `text` with ICU's u_strToUpper() using the rules of `locale`.
 * The text argument is received UTF-8 encoded (PyArg_ParseTuple "es"),
 * converted to UTF-16 for ICU and converted back to UTF-8 before being
 * decoded into the returned unicode object.
 *
 * If any ICU step reports failure, or the converted string is empty, the
 * original input text is returned unchanged.
 *
 * Returns a new reference, or NULL with a Python exception set.
 */
static PyObject *
icu_upper(PyObject *self, PyObject *args) {
    char *input = NULL, *ans = NULL, *buf3 = NULL;
    const char *loc = NULL;
    size_t sz = 0;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    /* From here on `input` is owned by this function and must be released
     * with PyMem_Free() on every exit path, hence the single cleanup label. */
    sz = strlen(input);
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    /* status is shared: if the first call fails the second is a no-op and
     * buf2 stays zero filled, so u_strlen() below yields 0. */
    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToUpper(buf2, sz*8, buf, -1, loc, &status);

    ans = input; /* fallback result when the ICU conversion is unusable */
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    /* On failure PyUnicode_DecodeUTF8 has already set an exception; leave it
     * in place and just return NULL after cleaning up. */
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

end:
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
}
// lower {{{
/*
 * lower(locale, text) -> unicode
 *
 * Lower-case `text` with ICU's u_strToLower() using the rules of `locale`.
 * The text argument is received UTF-8 encoded (PyArg_ParseTuple "es"),
 * converted to UTF-16 for ICU and converted back to UTF-8 before being
 * decoded into the returned unicode object.
 *
 * If any ICU step reports failure, or the converted string is empty, the
 * original input text is returned unchanged.
 *
 * Returns a new reference, or NULL with a Python exception set.
 */
static PyObject *
icu_lower(PyObject *self, PyObject *args) {
    char *input = NULL, *ans = NULL, *buf3 = NULL;
    const char *loc = NULL;
    size_t sz = 0;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    /* From here on `input` is owned by this function and must be released
     * with PyMem_Free() on every exit path, hence the single cleanup label. */
    sz = strlen(input);
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    /* status is shared: if the first call fails the second is a no-op and
     * buf2 stays zero filled, so u_strlen() below yields 0. */
    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToLower(buf2, sz*8, buf, -1, loc, &status);

    ans = input; /* fallback result when the ICU conversion is unusable */
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    /* On failure PyUnicode_DecodeUTF8 has already set an exception; leave it
     * in place and just return NULL after cleaning up. */
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

end:
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
}
// title {{{
/*
 * title(locale, text) -> unicode
 *
 * Title-case `text` with ICU's u_strToTitle() using the rules of `locale`.
 * NULL is passed as the break iterator argument, leaving the choice of
 * iterator to ICU. The text argument is received UTF-8 encoded
 * (PyArg_ParseTuple "es"), converted to UTF-16 for ICU and converted back
 * to UTF-8 before being decoded into the returned unicode object.
 *
 * If any ICU step reports failure, or the converted string is empty, the
 * original input text is returned unchanged.
 *
 * Returns a new reference, or NULL with a Python exception set.
 */
static PyObject *
icu_title(PyObject *self, PyObject *args) {
    char *input = NULL, *ans = NULL, *buf3 = NULL;
    const char *loc = NULL;
    size_t sz = 0;
    UChar *buf = NULL, *buf2 = NULL;
    PyObject *ret = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "ses", &loc, "UTF-8", &input)) return NULL;

    /* From here on `input` is owned by this function and must be released
     * with PyMem_Free() on every exit path, hence the single cleanup label. */
    sz = strlen(input);
    buf = (UChar*)calloc(sz*4 + 1, sizeof(UChar));
    buf2 = (UChar*)calloc(sz*8 + 1, sizeof(UChar));
    if (buf == NULL || buf2 == NULL) { PyErr_NoMemory(); goto end; }

    /* status is shared: if the first call fails the second is a no-op and
     * buf2 stays zero filled, so u_strlen() below yields 0. */
    u_strFromUTF8(buf, sz*4, NULL, input, sz, &status);
    u_strToTitle(buf2, sz*8, buf, -1, NULL, loc, &status);

    ans = input; /* fallback result when the ICU conversion is unusable */
    sz = u_strlen(buf2);
    if (U_SUCCESS(status) && sz > 0) {
        buf3 = (char*)calloc(sz*5+1, sizeof(char));
        if (buf3 == NULL) { PyErr_NoMemory(); goto end; }
        u_strToUTF8(buf3, sz*5, NULL, buf2, -1, &status);
        if (U_SUCCESS(status)) ans = buf3;
    }

    /* On failure PyUnicode_DecodeUTF8 has already set an exception; leave it
     * in place and just return NULL after cleaning up. */
    ret = PyUnicode_DecodeUTF8(ans, strlen(ans), "replace");

end:
    free(buf);
    free(buf2);
    free(buf3);
    PyMem_Free(input);
    return ret;
}
static PyMethodDef icu_methods[] = { static PyMethodDef icu_methods[] = {
{"upper", icu_upper, METH_VARARGS,
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
},
{"lower", icu_lower, METH_VARARGS,
"lower(locale, unicode object) -> lower cased unicode object using locale rules."
},
{"title", icu_title, METH_VARARGS,
"title(locale, unicode object) -> Title cased unicode object using locale rules."
},
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };

View File

@ -10,10 +10,18 @@ from functools import partial
from calibre.constants import plugins from calibre.constants import plugins
_icu = _collator = None _icu = _collator = None
_locale = None
_none = u'' _none = u''
_none2 = b'' _none2 = b''
def get_locale():
    '''
    Return the language code from calibre's get_lang(), looking it up on
    the first call and serving the value cached in the module-level
    _locale on later calls.
    '''
    global _locale
    if _locale is not None:
        return _locale
    # Imported here rather than at module level, as in the original code.
    from calibre.utils.localization import get_lang
    _locale = get_lang()
    return _locale
def load_icu(): def load_icu():
global _icu global _icu
if _icu is None: if _icu is None:
@ -28,11 +36,10 @@ def load_icu():
def load_collator(): def load_collator():
global _collator global _collator
from calibre.utils.localization import get_lang
if _collator is None: if _collator is None:
icu = load_icu() icu = load_icu()
if icu is not None: if icu is not None:
_collator = icu.Collator(get_lang()) _collator = icu.Collator(get_locale())
return _collator return _collator
@ -76,6 +83,13 @@ case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
icu_case_sensitive_sort_key icu_case_sensitive_sort_key
case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
# Case conversion callables taking a single string argument. When
# _icu_not_ok is true they fall back to the plain Python string methods;
# otherwise they call the corresponding function of the _icu extension with
# the value of get_locale() bound as the first (locale) argument. Note that
# get_locale() is evaluated once, when these assignments run, not per call.
upper = (lambda s: s.upper()) if _icu_not_ok else \
partial(_icu.upper, get_locale())
lower = (lambda s: s.lower()) if _icu_not_ok else \
partial(_icu.lower, get_locale())
# Named title_case rather than title.
title_case = (lambda s: s.title()) if _icu_not_ok else \
partial(_icu.title, get_locale())
def test(): # {{{ def test(): # {{{
# Data {{{ # Data {{{
@ -188,5 +202,13 @@ pêché'''
print 'French failed (note that French fails with icu < 4.6 i.e. on windows and OS X)' print 'French failed (note that French fails with icu < 4.6 i.e. on windows and OS X)'
return return
test_strcmp(german + french) test_strcmp(german + french)
print '\nTesting case transforms in current locale'
for x in ('a', 'Alice\'s code'):
print 'Upper:', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
print 'Lower:', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
print 'Title:', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8')
print
# }}} # }}}