Implement find_all for ICU collators

This commit is contained in:
Kovid Goyal 2022-04-23 14:33:41 +05:30
parent 50f4b86f9e
commit 8df81bb212
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 48 additions and 1 deletions

View File

@ -218,7 +218,7 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
UErrorCode status = U_ZERO_ERROR;
UStringSearch *search = NULL;
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
@ -245,6 +245,44 @@ end:
return (PyErr_Occurred()) ? NULL : Py_BuildValue("ll", (long)pos, (long)length);
} // }}}
// Collator.find_all {{{
static PyObject *
icu_Collator_find_all(icu_Collator *self, PyObject *args) {
PyObject *a_ = NULL, *b_ = NULL, *callback;
UChar *a = NULL, *b = NULL;
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *search = NULL;
if (!PyArg_ParseTuple(args, "UUO", &a_, &b_, &callback)) return NULL;
a = python_to_icu(a_, &asz);
b = python_to_icu(b_, &bsz);
if (a && b) {
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
if (search && U_SUCCESS(status)) {
pos = usearch_first(search, &status);
int32_t codepoint_count = 0, pos_for_codepoint_count = 0;
while (pos != USEARCH_DONE) {
codepoint_count += u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
pos_for_codepoint_count = pos;
length = usearch_getMatchedLength(search);
length = u_countChar32(b + pos, length);
PyObject *ret = PyObject_CallFunction(callback, "ii", pos, length);
if (ret && ret == Py_None) pos = usearch_next(search, &status);
else pos = USEARCH_DONE;
Py_CLEAR(ret);
}
} else PyErr_SetString(PyExc_ValueError, u_errorName(status));
}
if (search != NULL) usearch_close(search);
if (a != NULL) free(a);
if (b != NULL) free(b);
if (PyErr_Occurred()) return NULL;
Py_RETURN_NONE;
} // }}}
// Collator.contains {{{
static PyObject *
icu_Collator_contains(icu_Collator *self, PyObject *args) {
@ -444,6 +482,10 @@ static PyMethodDef icu_Collator_methods[] = {
"strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sorty_key(a), sort_key(b)), but faster."
},
{"find_all", (PyCFunction)icu_Collator_find_all, METH_VARARGS,
"find(pattern, source, callback) -> reports the position and length of all occurrences of pattern in source to callback. Aborts if callback returns anything other than None."
},
{"find", (PyCFunction)icu_Collator_find, METH_VARARGS,
"find(pattern, source) -> returns the position and length of the first occurrence of pattern in source. Returns (-1, -1) if not found."
},

View File

@ -118,6 +118,11 @@ class TestICU(unittest.TestCase):
self.ae((0, 13), icu.primary_no_punc_find("typographers", 'typographers'))
self.ae((0, 7), icu.primary_no_punc_find('abcd', 'a\u00adb\u200cc\u200dd'))
self.ae((0, 5), icu.primary_no_punc_find('abcd', 'ab cd'))
# test find all
m = []
a = lambda p,l : m.append((p, l))
icu.primary_collator_without_punctuation().find_all('a', 'a a🐱a', a)
self.ae(m, [(0, 1), (2, 1), (5, 1)])
def test_collation_order(self):
'Testing collation ordering'