diff --git a/src/calibre/gui2/complete2.py b/src/calibre/gui2/complete2.py
index a2065376e5..c74d8ab6eb 100644
--- a/src/calibre/gui2/complete2.py
+++ b/src/calibre/gui2/complete2.py
@@ -33,7 +33,7 @@ from calibre.constants import ismacos
 from calibre.gui2.widgets import EnComboBox, LineEditECM
 from calibre.spell.break_iterator import get_word_break_iterator_for_ui_thread
 from calibre.utils.config import tweaks
-from calibre.utils.icu import primary_collator, primary_contains, primary_find, primary_sort_key, primary_startswith, sort_key
+from calibre.utils.icu import primary_collator, primary_contains, primary_find, primary_sort_key, primary_startswith, sort_key, word_prefix_find
 
 
 def containsq(x, prefix):
@@ -44,14 +44,6 @@ def hierarchy_startswith(x, prefix, sep='.'):
     return primary_startswith(x, prefix) or primary_contains(sep + prefix, x)
 
 
-def word_prefix_find(collator, it, x, prefix):
-    it.set_text(x)
-    for pos, size in it.split2():
-        if collator.startswith(x, prefix, pos):
-            return pos
-    return -1
-
-
 def word_prefix_matcher(collator, it, x, prefix):
     it.set_text(x)
     for pos, size in it.split2():
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index 3a9e3adfda..b9c2212cb3 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -1640,6 +1640,75 @@ icu_utf16_length(PyObject *self, PyObject *src) {
     return Py_BuildValue("n", sz);
 } // }}}
 
+// word_prefix_find {{{
+// C implementation of word_prefix_find() from complete2.py.
+//
+// Python signature: word_prefix_find(collator, break_iterator, string, prefix)
+// Sets string as the text of break_iterator (like break_iterator.set_text()),
+// then returns the codepoint offset of the first word position at which string
+// starts with prefix according to collator, or -1 if there is none. Both
+// strings are converted to ICU strings only once.
+static PyObject *
+icu_word_prefix_find(PyObject *self, PyObject *args) {
+    PyObject *collator_obj = NULL, *it_obj = NULL, *x_ = NULL, *prefix_ = NULL;
+    icu_Collator *collator = NULL;
+    icu_BreakIterator *it = NULL;
+    UChar *x_icu = NULL, *prefix_icu = NULL;
+    int32_t xsz = 0, prefix_sz = 0, pos, sz, utf16_start = 0, prev_cp_pos = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    long ans = -1;
+    BreakIterState state;
+
+    if (!PyArg_ParseTuple(args, "O!O!OO",
+            &icu_CollatorType, &collator_obj,
+            &icu_BreakIteratorType, &it_obj,
+            &x_, &prefix_)) return NULL;
+    collator = (icu_Collator *)collator_obj;
+    it = (icu_BreakIterator *)it_obj;
+
+    // Convert x to ICU and set it on the break iterator (equivalent to it.set_text(x))
+    x_icu = python_to_icu(x_, &xsz);
+    if (x_icu == NULL) return NULL;
+    it->counter++;
+    ubrk_setText(it->break_iterator, x_icu, xsz, &status);
+    if (U_FAILURE(status)) {
+        // On failure the iterator may still refer to its previous text, so
+        // it->text is left untouched and only the new buffer is released.
+        free(x_icu);
+        PyErr_SetString(PyExc_ValueError, u_errorName(status));
+        return NULL;
+    }
+    // Release the old buffer only now that the iterator no longer uses it
+    free(it->text);
+    it->text = x_icu; it->text_len = xsz;
+    x_icu = NULL; // ownership transferred to it->text
+
+    // Convert prefix to ICU once
+    prefix_icu = python_to_icu(prefix_, &prefix_sz);
+    if (prefix_icu == NULL) return NULL;
+
+    // Iterate over word positions and find the first where x starts with prefix
+    break_iter_state_init(it, &state);
+    while (break_iter_state_next(it, &state, &pos, &sz)) {
+        // pos is a codepoint offset; advance the UTF-16 cursor incrementally
+        if (pos > prev_cp_pos) {
+            U16_FWD_N(it->text, utf16_start, it->text_len, (uint32_t)(pos - prev_cp_pos));
+            prev_cp_pos = pos;
+        }
+        if (utf16_start >= it->text_len) break;
+        // Empty prefix matches at the first word position
+        if (prefix_sz == 0) { ans = (long)pos; break; }
+        // Check if x starting at utf16_start begins with prefix using the collator
+        if (it->text_len - utf16_start >= prefix_sz &&
+                ucol_equal(collator->collator, it->text + utf16_start, prefix_sz, prefix_icu, prefix_sz)) {
+            ans = (long)pos;
+            break;
+        }
+    }
+    free(prefix_icu);
+    return Py_BuildValue("l", ans);
+} // }}}
+
 // Module initialization {{{
 static PyMethodDef icu_methods[] = {
     {"change_case", icu_change_case, METH_VARARGS,
@@ -1698,6 +1767,10 @@ static PyMethodDef icu_methods[] = {
         "utf16_length(string) -> Return the length of a string (number of UTF-16 code points in the string). Useful on wide python builds where len() returns an incorrect answer if the string contains surrogate pairs."
     },
 
+    {"word_prefix_find", icu_word_prefix_find, METH_VARARGS,
+        "word_prefix_find(collator, break_iterator, string, prefix) -> Return the codepoint offset of the first word in string that starts with prefix according to collator, or -1 if none."
+    },
+
     {NULL} /* Sentinel */
 };
 
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 3b85670adc..f5c8c53752 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -239,6 +239,7 @@ startswith = make_two_arg_func(collator, 'startswith')
 primary_startswith = make_two_arg_func(primary_collator, 'startswith')
 safe_chr = _icu.chr
 ord_string = _icu.ord_string
+word_prefix_find = _icu.word_prefix_find
 
 
 def character_name(string):
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 0c5e9fc8d4..413efdb808 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -276,6 +276,35 @@ class TestICU(unittest.TestCase):
             fpos = index_of(needle, haystack)
             self.ae(pos, fpos, f'Failed to find index of {needle!r} in {haystack!r} ({pos} != {fpos})')
 
+    def test_word_prefix_find(self):
+        ' Test the C implementation of word_prefix_find '
+        from calibre_extensions import icu as _icu
+        c = icu.primary_collator()
+        it = _icu.BreakIterator(_icu.UBRK_WORD, 'en')
+        wpf = _icu.word_prefix_find
+        # Basic prefix matches
+        self.ae(wpf(c, it, 'hello world', 'wo'), 6)
+        self.ae(wpf(c, it, 'hello world', 'he'), 0)
+        self.ae(wpf(c, it, 'hello world', 'world'), 6)
+        self.ae(wpf(c, it, 'hello world', 'hello'), 0)
+        # No match returns -1
+        self.ae(wpf(c, it, 'hello world', 'xyz'), -1)
+        # Case-insensitive match with primary collator
+        self.ae(wpf(c, it, 'Hello World', 'wo'), 6)
+        self.ae(wpf(c, it, 'Hello World', 'he'), 0)
+        # Accents ignored with primary collator
+        self.ae(wpf(c, it, 'peña mundo', 'pen'), 0)
+        # Empty prefix matches first word
+        self.ae(wpf(c, it, 'hello world', ''), 0)
+        # Empty string returns -1
+        self.ae(wpf(c, it, '', 'x'), -1)
+        self.ae(wpf(c, it, '', ''), -1)
+        # Surrogate pairs: emoji counts as 1 codepoint
+        self.ae(wpf(c, it, '\U0001f431 world', 'wo'), 2)
+        # Multiple calls reuse the iterator
+        self.ae(wpf(c, it, 'one two three', 'tw'), 4)
+        self.ae(wpf(c, it, 'one two three', 'th'), 8)
+
     def test_remove_accents(self):
         for func in (icu.remove_accents_icu, icu.remove_accents_regex):
             for q, expected in {