mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Expose some more functionality from the ICU library to python
This commit is contained in:
parent
1e7c76df09
commit
5509f6ac38
@ -5,6 +5,7 @@
|
|||||||
#include <unicode/uclean.h>
|
#include <unicode/uclean.h>
|
||||||
#include <unicode/ucol.h>
|
#include <unicode/ucol.h>
|
||||||
#include <unicode/ustring.h>
|
#include <unicode/ustring.h>
|
||||||
|
#include <unicode/usearch.h>
|
||||||
|
|
||||||
|
|
||||||
// Collator object definition {{{
|
// Collator object definition {{{
|
||||||
@ -63,7 +64,7 @@ icu_Collator_display_name(icu_Collator *self, void *closure) {
|
|||||||
|
|
||||||
u_strToUTF8(buf, 100, NULL, dname, -1, &status);
|
u_strToUTF8(buf, 100, NULL, dname, -1, &status);
|
||||||
if (U_FAILURE(status)) {
|
if (U_FAILURE(status)) {
|
||||||
PyErr_SetString(PyExc_Exception, "Failed ot convert dname to UTF-8"); return NULL;
|
PyErr_SetString(PyExc_Exception, "Failed to convert dname to UTF-8"); return NULL;
|
||||||
}
|
}
|
||||||
return Py_BuildValue("s", buf);
|
return Py_BuildValue("s", buf);
|
||||||
}
|
}
|
||||||
@ -164,7 +165,91 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
|||||||
return Py_BuildValue("i", res);
|
return Py_BuildValue("i", res);
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
|
// Collator.find {{{
|
||||||
|
static PyObject *
|
||||||
|
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
PyObject *a_, *b_;
|
||||||
|
size_t asz, bsz;
|
||||||
|
UChar *a, *b;
|
||||||
|
wchar_t *aw, *bw;
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UStringSearch *search = NULL;
|
||||||
|
int32_t pos = -1, length = -1;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
|
||||||
|
asz = PyUnicode_GetSize(a_); bsz = PyUnicode_GetSize(b_);
|
||||||
|
|
||||||
|
a = (UChar*)calloc(asz*4 + 2, sizeof(UChar));
|
||||||
|
b = (UChar*)calloc(bsz*4 + 2, sizeof(UChar));
|
||||||
|
aw = (wchar_t*)calloc(asz*4 + 2, sizeof(wchar_t));
|
||||||
|
bw = (wchar_t*)calloc(bsz*4 + 2, sizeof(wchar_t));
|
||||||
|
|
||||||
|
if (a == NULL || b == NULL || aw == NULL || bw == NULL) return PyErr_NoMemory();
|
||||||
|
|
||||||
|
PyUnicode_AsWideChar((PyUnicodeObject*)a_, aw, asz*4+1);
|
||||||
|
PyUnicode_AsWideChar((PyUnicodeObject*)b_, bw, bsz*4+1);
|
||||||
|
u_strFromWCS(a, asz*4 + 1, NULL, aw, -1, &status);
|
||||||
|
u_strFromWCS(b, bsz*4 + 1, NULL, bw, -1, &status);
|
||||||
|
|
||||||
|
if (U_SUCCESS(status)) {
|
||||||
|
search = usearch_openFromCollator(a, -1, b, -1, self->collator, NULL, &status);
|
||||||
|
if (U_SUCCESS(status)) {
|
||||||
|
pos = usearch_first(search, &status);
|
||||||
|
if (pos != USEARCH_DONE)
|
||||||
|
length = (pos == USEARCH_DONE) ? -1 : usearch_getMatchedLength(search);
|
||||||
|
else
|
||||||
|
pos = -1;
|
||||||
|
}
|
||||||
|
if (search != NULL) usearch_close(search);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(a); free(b); free(aw); free(bw);
|
||||||
|
|
||||||
|
return Py_BuildValue("ii", pos, length);
|
||||||
|
} // }}}
|
||||||
|
|
||||||
|
// Collator.contractions {{{
|
||||||
|
static PyObject *
|
||||||
|
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
USet *contractions;
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UChar *str;
|
||||||
|
UChar32 start=0, end=0;
|
||||||
|
int32_t count = 0, len = 0, dlen = 0, i;
|
||||||
|
PyObject *ans = Py_None, *pbuf;
|
||||||
|
wchar_t *buf;
|
||||||
|
|
||||||
|
str = (UChar*)calloc(100, sizeof(UChar));
|
||||||
|
buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t));
|
||||||
|
if (str == NULL || buf == NULL) return PyErr_NoMemory();
|
||||||
|
|
||||||
|
contractions = uset_open(1, 0);
|
||||||
|
ucol_getContractionsAndExpansions(self->collator, contractions, NULL, 0, &status);
|
||||||
|
if (U_SUCCESS(status)) {
|
||||||
|
count = uset_getItemCount(contractions);
|
||||||
|
ans = PyTuple_New(count);
|
||||||
|
if (ans != NULL) {
|
||||||
|
for (i = 0; i < count; i++) {
|
||||||
|
len = uset_getItem(contractions, i, &start, &end, str, 1000, &status);
|
||||||
|
if (len >= 2) {
|
||||||
|
// We have a string
|
||||||
|
status = U_ZERO_ERROR;
|
||||||
|
u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status);
|
||||||
|
pbuf = PyUnicode_FromWideChar(buf, dlen);
|
||||||
|
if (pbuf == NULL) return PyErr_NoMemory();
|
||||||
|
PyTuple_SetItem(ans, i, pbuf);
|
||||||
|
} else {
|
||||||
|
// Ranges dont make sense for contractions, ignore them
|
||||||
|
PyTuple_SetItem(ans, i, Py_None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uset_close(contractions);
|
||||||
|
free(str); free(buf);
|
||||||
|
|
||||||
|
return Py_BuildValue("O", ans);
|
||||||
|
} // }}}
|
||||||
|
|
||||||
static PyMethodDef icu_Collator_methods[] = {
|
static PyMethodDef icu_Collator_methods[] = {
|
||||||
{"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
|
{"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
|
||||||
@ -175,6 +260,13 @@ static PyMethodDef icu_Collator_methods[] = {
|
|||||||
"strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sorty_key(a), sort_key(b)), but faster."
|
"strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sorty_key(a), sort_key(b)), but faster."
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{"find", (PyCFunction)icu_Collator_find, METH_VARARGS,
|
||||||
|
"find(pattern, source) -> returns the position and length of the first occurrence of pattern in source. Returns (-1, -1) if not found."
|
||||||
|
},
|
||||||
|
|
||||||
|
{"contractions", (PyCFunction)icu_Collator_contractions, METH_VARARGS,
|
||||||
|
"contractions() -> returns the contractions defined for this collator."
|
||||||
|
},
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -59,6 +59,18 @@ def icu_sort_key(collator, obj):
|
|||||||
return _none2
|
return _none2
|
||||||
return collator.sort_key(lower(obj))
|
return collator.sort_key(lower(obj))
|
||||||
|
|
||||||
|
def py_find(pattern, source):
    """Locate ``pattern`` in ``source`` with plain string search.

    Returns a ``(position, length)`` tuple for the first occurrence, where
    ``length`` is ``len(pattern)``, or ``(-1, -1)`` when there is no match.
    """
    index = source.find(pattern)
    return (index, len(pattern)) if index > -1 else (-1, -1)
|
||||||
|
|
||||||
|
def icu_find(collator, pattern, source):
    """Run ``collator.find(pattern, source)`` and return its result.

    If the collator rejects the arguments with a ``TypeError``, both are
    coerced with ``unicode()`` and the call is retried once.
    """
    try:
        result = collator.find(pattern, source)
    except TypeError:
        result = collator.find(unicode(pattern), unicode(source))
    return result
|
||||||
|
|
||||||
def py_case_sensitive_sort_key(obj):
|
def py_case_sensitive_sort_key(obj):
|
||||||
if not obj:
|
if not obj:
|
||||||
return _none
|
return _none
|
||||||
@ -82,6 +94,16 @@ def icu_capitalize(s):
|
|||||||
s = lower(s)
|
s = lower(s)
|
||||||
return s.replace(s[0], upper(s[0]), 1) if s else s
|
return s.replace(s[0], upper(s[0]), 1) if s else s
|
||||||
|
|
||||||
|
# Cache of the contractions already computed, keyed by collator object.
_cmap = {}

def icu_contractions(collator):
    """Return the contractions defined by ``collator``, cached per collator.

    ``collator.contractions()`` is called at most once per collator. Its
    result may contain ``None`` placeholders (used for items that are not
    contraction strings); these are dropped so callers only ever see real
    strings. The result is a ``frozenset`` of strings, or an empty dict when
    the collator reports no contractions at all.
    """
    ans = _cmap.get(collator, None)
    if ans is None:
        ans = collator.contractions()
        # filter(None, ...) removes the None placeholders (and empty strings).
        ans = frozenset(filter(None, ans)) if ans else {}
        _cmap[collator] = ans
    return ans
|
||||||
|
|
||||||
load_icu()
|
load_icu()
|
||||||
load_collator()
|
load_collator()
|
||||||
_icu_not_ok = _icu is None or _collator is None
|
_icu_not_ok = _icu is None or _collator is None
|
||||||
@ -117,6 +139,11 @@ title_case = (lambda s: s.title()) if _icu_not_ok else \
|
|||||||
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
|
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
|
||||||
(lambda s: icu_capitalize(s))
|
(lambda s: icu_capitalize(s))
|
||||||
|
|
||||||
|
# Public helpers: fall back to pure-Python behaviour when ICU (or the
# collator) could not be loaded, otherwise bind the ICU implementations to
# the module-level collator.
if _icu_not_ok:
    find = py_find
    contractions = (lambda : {})
else:
    find = partial(icu_find, _collator)
    contractions = partial(icu_contractions, _collator)
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
def test(): # {{{
|
def test(): # {{{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user