Allow searching for whole words with the ICU find() methods

This commit is contained in:
Kovid Goyal 2022-04-24 19:07:50 +05:30
parent c116696086
commit 09e06fbb82
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 38 additions and 5 deletions

View File

@ -30,6 +30,7 @@ typedef struct {
// Type-specific fields go here.
UCollator *collator;
USet *contractions;
UBreakIterator *word_iterator;
} icu_Collator;
@ -38,7 +39,9 @@ icu_Collator_dealloc(icu_Collator* self)
{
if (self->collator != NULL) ucol_close(self->collator);
if (self->contractions != NULL) uset_close(self->contractions);
self->collator = NULL;
if (self->word_iterator) ubrk_close(self->word_iterator);
self->collator = NULL; self->contractions = NULL;
self->word_iterator = NULL;
Py_TYPE(self)->tp_free((PyObject*)self);
}
@ -61,6 +64,7 @@ icu_Collator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self != NULL) {
self->collator = collator;
self->contractions = NULL;
self->word_iterator = NULL;
}
return (PyObject *)self;
@ -210,6 +214,23 @@ end:
} // }}}
// Collator.find {{{
// Lazily build the word break iterator used for whole-word searches and
// store it in self->word_iterator. Does nothing when an iterator is already
// present. The iterator is opened for the collator's valid locale with no
// text attached. The function returns nothing: on failure it sets a Python
// ValueError, so callers are expected to check PyErr_Occurred() afterwards.
static void
create_word_iterator(icu_Collator *self) {
    if (self->word_iterator != NULL) return;  // already created, reuse it

    UErrorCode err = U_ZERO_ERROR;
    const char *locale_name = ucol_getLocaleByType(self->collator, ULOC_VALID_LOCALE, &err);

    if (locale_name != NULL && U_SUCCESS(err)) {
        // Open a word-boundary iterator with no initial text (NULL, -1).
        self->word_iterator = ubrk_open(UBRK_WORD, locale_name, NULL, -1, &err);
        if (self->word_iterator == NULL || U_FAILURE(err)) {
            PyErr_SetString(PyExc_ValueError, "Failed to create word break iterator for collator");
        }
    } else {
        PyErr_SetString(PyExc_ValueError, "Failed to get locale for collator");
    }
}
static PyObject *
icu_Collator_find(icu_Collator *self, PyObject *args) {
PyObject *a_ = NULL, *b_ = NULL;
@ -217,15 +238,18 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *search = NULL;
int whole_words = 0;
if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
if (!PyArg_ParseTuple(args, "UU|p", &a_, &b_, &whole_words)) return NULL;
if (whole_words) create_word_iterator(self);
if (PyErr_Occurred()) return NULL;
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz);
if (b == NULL) goto end;
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, whole_words ? self->word_iterator : NULL, &status);
if (U_SUCCESS(status)) {
pos = usearch_first(search, &status);
if (pos != USEARCH_DONE) {
@ -253,13 +277,16 @@ icu_Collator_find_all(icu_Collator *self, PyObject *args) {
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *search = NULL;
int whole_words = 0;
if (!PyArg_ParseTuple(args, "UUO", &a_, &b_, &callback)) return NULL;
if (!PyArg_ParseTuple(args, "UUO|p", &a_, &b_, &callback, &whole_words)) return NULL;
if (whole_words) create_word_iterator(self);
if (PyErr_Occurred()) return NULL;
a = python_to_icu(a_, &asz);
b = python_to_icu(b_, &bsz);
if (a && b) {
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, whole_words ? self->word_iterator : NULL, &status);
if (search && U_SUCCESS(status)) {
pos = usearch_first(search, &status);
int32_t codepoint_count = 0, pos_for_codepoint_count = 0;
@ -620,6 +647,8 @@ icu_Collator_clone(icu_Collator *self, PyObject *args)
clone->collator = collator;
clone->contractions = NULL;
if (self->word_iterator) clone->word_iterator = ubrk_clone(self->word_iterator, &status);
else clone->word_iterator = NULL;
return (PyObject*) clone;

View File

@ -123,6 +123,10 @@ class TestICU(unittest.TestCase):
a = lambda p,l : m.append((p, l))
icu.primary_collator_without_punctuation().find_all('a', 'a a🐱a', a)
self.ae(m, [(0, 1), (2, 1), (5, 1)])
# test find whole words
c = icu.primary_collator_without_punctuation()
self.ae(c.find('a', 'abc a bc'), (0, 1))
self.ae(c.find('a', 'abc a bc', True), (4, 1))
def test_collation_order(self):
'Testing collation ordering'