Implement word_prefix_find() in C in the icu.c module

Fixes #3046
2026-04-02 23:31:59 -04:00 · 2026-03-16 07:41:29 +00:00 · 2026-03-16 07:41:29 +00:00 · 267a89a0dd
commit 267a89a0dd
parent ed76e7ad7c
4 changed files with 97 additions and 9 deletions
--- a/src/calibre/gui2/complete2.py
+++ b/src/calibre/gui2/complete2.py
@ -33,7 +33,7 @@ from calibre.constants import ismacos
 from calibre.gui2.widgets import EnComboBox, LineEditECM
 from calibre.spell.break_iterator import get_word_break_iterator_for_ui_thread
 from calibre.utils.config import tweaks
-from calibre.utils.icu import primary_collator, primary_contains, primary_find, primary_sort_key, primary_startswith, sort_key
+from calibre.utils.icu import primary_collator, primary_contains, primary_find, primary_sort_key, primary_startswith, sort_key, word_prefix_find


 def containsq(x, prefix):
@ -44,14 +44,6 @@ def hierarchy_startswith(x, prefix, sep='.'):
    return primary_startswith(x, prefix) or primary_contains(sep + prefix, x)


-def word_prefix_find(collator, it, x, prefix):
-    it.set_text(x)
-    for pos, size in it.split2():
-        if collator.startswith(x, prefix, pos):
-            return pos
-    return -1
-
-
 def word_prefix_matcher(collator, it, x, prefix):
    it.set_text(x)
    for pos, size in it.split2():
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -1640,6 +1640,68 @@ icu_utf16_length(PyObject *self, PyObject *src) {
    return Py_BuildValue("n", sz);
 } // }}}

+// word_prefix_find {{{
+// C implementation of word_prefix_find() from complete2.py.
+//
+// Python signature: word_prefix_find(collator, break_iterator, string, prefix)
+//
+// Sets `string` as the text of break_iterator (equivalent to it.set_text(string)),
+// then walks the word positions reported by the iterator and returns the
+// position of the first word at which `string` starts with `prefix` according
+// to collator, or -1 if no word matches. Both python strings are converted to
+// ICU strings only once, rather than once per word.
+//
+// Returns NULL on error: argument parsing failure, string conversion failure
+// (python_to_icu() is presumed to have set the exception), or a ValueError if
+// ICU rejects the text. When a string conversion fails the iterator is left
+// untouched.
+static PyObject *
+icu_word_prefix_find(PyObject *self, PyObject *args) {
+    PyObject *collator_obj = NULL, *it_obj = NULL, *x_ = NULL, *prefix_ = NULL;
+    icu_Collator *collator = NULL;
+    icu_BreakIterator *it = NULL;
+    UChar *x_icu = NULL, *prefix_icu = NULL, *old_text = NULL;
+    int32_t xsz = 0, prefix_sz = 0, pos = 0, sz = 0, utf16_start = 0, prev_cp_pos = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    long ans = -1;
+    BreakIterState state;
+
+    if (!PyArg_ParseTuple(args, "O!O!OO",
+                          &icu_CollatorType, &collator_obj,
+                          &icu_BreakIteratorType, &it_obj,
+                          &x_, &prefix_)) return NULL;
+    collator = (icu_Collator *)collator_obj;
+    it = (icu_BreakIterator *)it_obj;
+
+    // Convert both strings up front, so that a bad argument is reported
+    // before the break iterator has been modified.
+    x_icu = python_to_icu(x_, &xsz);
+    if (x_icu == NULL) return NULL;
+    prefix_icu = python_to_icu(prefix_, &prefix_sz);
+    if (prefix_icu == NULL) { free(x_icu); return NULL; }
+
+    // Set x on the break iterator (equivalent to it.set_text(x)).
+    // The previous buffer is released only after ubrk_setText() succeeds: if
+    // it fails, the ICU iterator may still reference the old text, so freeing
+    // it first would leave the iterator with a dangling pointer.
+    it->counter++;
+    ubrk_setText(it->break_iterator, x_icu, xsz, &status);
+    if (U_FAILURE(status)) {
+        free(x_icu);
+        free(prefix_icu);
+        PyErr_SetString(PyExc_ValueError, u_errorName(status));
+        return NULL;
+    }
+    old_text = it->text;
+    it->text = x_icu; it->text_len = xsz;
+    x_icu = NULL;  // ownership transferred to it->text
+    free(old_text);  // free(NULL) is a no-op
+
+    // Iterate over word positions and find the first where x starts with prefix
+    break_iter_state_init(it, &state);
+    while (break_iter_state_next(it, &state, &pos, &sz)) {
+        // pos is a codepoint offset; advance the UTF-16 cursor incrementally
+        // so the text is scanned at most once over the whole loop
+        if (pos > prev_cp_pos) {
+            U16_FWD_N(it->text, utf16_start, it->text_len, (uint32_t)(pos - prev_cp_pos));
+            prev_cp_pos = pos;
+        }
+        if (utf16_start >= it->text_len) break;
+        // Empty prefix matches at the first word position
+        if (prefix_sz == 0) { ans = (long)pos; break; }
+        // Check if x starting at utf16_start begins with prefix using the collator.
+        // Only the first prefix_sz UTF-16 units of the remaining text are compared.
+        if (it->text_len - utf16_start >= prefix_sz &&
+                ucol_equal(collator->collator, it->text + utf16_start, prefix_sz, prefix_icu, prefix_sz)) {
+            ans = (long)pos;
+            break;
+        }
+    }
+    free(prefix_icu);
+    return Py_BuildValue("l", ans);
+} // }}}
+
+
 // Module initialization {{{
 static PyMethodDef icu_methods[] = {
    {"change_case", icu_change_case, METH_VARARGS,
@ -1698,6 +1760,10 @@ static PyMethodDef icu_methods[] = {
     "utf16_length(string) -> Return the length of a string (number of UTF-16 code points in the string). Useful on wide python builds where len() returns an incorrect answer if the string contains surrogate pairs."
    },

+    {"word_prefix_find", icu_word_prefix_find, METH_VARARGS,
+     "word_prefix_find(collator, break_iterator, string, prefix) -> Return the codepoint offset of the first word in string that starts with prefix according to collator, or -1 if none."
+    },
+
    {NULL}  /* Sentinel */
 };

--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -239,6 +239,7 @@ startswith = make_two_arg_func(collator, 'startswith')
 primary_startswith = make_two_arg_func(primary_collator, 'startswith')
 safe_chr = _icu.chr
 ord_string = _icu.ord_string
+word_prefix_find = _icu.word_prefix_find


 def character_name(string):
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@ -276,6 +276,35 @@ class TestICU(unittest.TestCase):
            fpos = index_of(needle, haystack)
            self.ae(pos, fpos, f'Failed to find index of {needle!r} in {haystack!r} ({pos} != {fpos})')

+    def test_word_prefix_find(self):
+        ' Test the C implementation of word_prefix_find '
+        from calibre_extensions import icu as _icu
+        c = icu.primary_collator()
+        it = _icu.BreakIterator(_icu.UBRK_WORD, 'en')
+        wpf = _icu.word_prefix_find
+        # Basic prefix matches
+        self.ae(wpf(c, it, 'hello world', 'wo'), 6)
+        self.ae(wpf(c, it, 'hello world', 'he'), 0)
+        self.ae(wpf(c, it, 'hello world', 'world'), 6)
+        self.ae(wpf(c, it, 'hello world', 'hello'), 0)
+        # No match returns -1
+        self.ae(wpf(c, it, 'hello world', 'xyz'), -1)
+        # Case-insensitive match with primary collator
+        self.ae(wpf(c, it, 'Hello World', 'wo'), 6)
+        self.ae(wpf(c, it, 'Hello World', 'he'), 0)
+        # Accents ignored with primary collator
+        self.ae(wpf(c, it, 'peña mundo', 'pen'), 0)
+        # Empty prefix matches first word
+        self.ae(wpf(c, it, 'hello world', ''), 0)
+        # Empty string returns -1
+        self.ae(wpf(c, it, '', 'x'), -1)
+        self.ae(wpf(c, it, '', ''), -1)
+        # Surrogate pairs: emoji counts as 1 codepoint
+        self.ae(wpf(c, it, '\U0001f431 world', 'wo'), 2)
+        # Multiple calls reuse the iterator
+        self.ae(wpf(c, it, 'one two three', 'tw'), 4)
+        self.ae(wpf(c, it, 'one two three', 'th'), 8)
+
    def test_remove_accents(self):
        for func in (icu.remove_accents_icu, icu.remove_accents_regex):
            for q, expected in {