diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index f1c305b3da..3a84213326 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -218,7 +218,7 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
     UErrorCode status = U_ZERO_ERROR;
     UStringSearch *search = NULL;
 
-    if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
+    if (!PyArg_ParseTuple(args, "UU", &a_, &b_)) return NULL;
 
     a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
@@ -245,6 +245,52 @@ end:
     return (PyErr_Occurred()) ? NULL : Py_BuildValue("ll", (long)pos, (long)length);
 } // }}}
 
+// Collator.find_all {{{
+// Report every match of the pattern (first argument) in the source text
+// (second argument) to a Python callback as (position, length). Both values
+// are expressed in Unicode code points rather than UTF-16 code units.
+// Iteration stops when the callback raises or returns anything but None.
+static PyObject *
+icu_Collator_find_all(icu_Collator *self, PyObject *args) {
+    PyObject *a_ = NULL, *b_ = NULL, *callback;
+    UChar *a = NULL, *b = NULL;
+    int32_t asz = 0, bsz = 0, pos = -1, length = -1;
+    UErrorCode status = U_ZERO_ERROR;
+    UStringSearch *search = NULL;
+
+    if (!PyArg_ParseTuple(args, "UUO", &a_, &b_, &callback)) return NULL;
+
+    a = python_to_icu(a_, &asz);
+    b = python_to_icu(b_, &bsz);
+    if (a && b) {
+        search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
+        if (search && U_SUCCESS(status)) {
+            pos = usearch_first(search, &status);
+            int32_t codepoint_count = 0, pos_for_codepoint_count = 0;
+            while (pos != USEARCH_DONE) {
+                // pos is an offset into the UTF-16 buffer b. Extend the running
+                // code point count by the stretch between the previous match
+                // and this one instead of recounting from the start each time.
+                codepoint_count += u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
+                pos_for_codepoint_count = pos;
+                length = usearch_getMatchedLength(search);
+                // Convert the match length from UTF-16 units to code points.
+                length = u_countChar32(b + pos, length);
+                PyObject *ret = PyObject_CallFunction(callback, "ii", codepoint_count, length);
+                if (ret && ret == Py_None) pos = usearch_next(search, &status);
+                else pos = USEARCH_DONE;
+                Py_CLEAR(ret);
+            }
+        } else PyErr_SetString(PyExc_ValueError, u_errorName(status));
+    }
+    if (search != NULL) usearch_close(search);
+    if (a != NULL) free(a);
+    if (b != NULL) free(b);
+
+    if (PyErr_Occurred()) return NULL;
+    Py_RETURN_NONE;
+} // }}}
+
 // Collator.contains {{{
 static PyObject *
 icu_Collator_contains(icu_Collator *self, PyObject *args) {
@@ -444,6 +490,10 @@ static PyMethodDef icu_Collator_methods[] = {
      "strcmp(unicode object, unicode object) -> strcmp(a, b) <=> cmp(sorty_key(a), sort_key(b)), but faster."
     },
 
+    {"find_all", (PyCFunction)icu_Collator_find_all, METH_VARARGS,
+     "find_all(pattern, source, callback) -> reports the position and length of all occurrences of pattern in source to callback. Aborts if callback returns anything other than None."
+    },
+
     {"find", (PyCFunction)icu_Collator_find, METH_VARARGS,
      "find(pattern, source) -> returns the position and length of the first occurrence of pattern in source. Returns (-1, -1) if not found."
     },
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 6840777b6a..2c5d7b2428 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -118,6 +118,11 @@ class TestICU(unittest.TestCase):
         self.ae((0, 13), icu.primary_no_punc_find("typographers", 'typographer’s'))
         self.ae((0, 7), icu.primary_no_punc_find('abcd', 'a\u00adb\u200cc\u200dd'))
         self.ae((0, 5), icu.primary_no_punc_find('abcd', 'ab cd'))
+        # test find all: positions are code point indices, so the non-BMP cat counts as one
+        m = []
+        a = lambda p,l : m.append((p, l))
+        icu.primary_collator_without_punctuation().find_all('a', 'a a🐱a', a)
+        self.ae(m, [(0, 1), (2, 1), (4, 1)])
 
     def test_collation_order(self):
         'Testing collation ordering'