E-book viewer: Fix incorrect search match offsets in normal search mode when the text contains non-BMP Unicode characters. Fixes #2152227 [search ignores some chars](https://bugs.launchpad.net/calibre/+bug/2152227)

This commit is contained in:
Kovid Goyal
2026-05-13 15:46:02 +05:30
parent 801af5a41d
commit cf3a6f4e6f
2 changed files with 10 additions and 6 deletions
+5 -5
View File
@@ -298,15 +298,15 @@ icu_Collator_find_all(icu_Collator *self, PyObject *const *args, Py_ssize_t narg
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, whole_words ? self->word_iterator : NULL, &status);
if (search && U_SUCCESS(status)) {
pos = usearch_first(search, &status);
int32_t pos_for_codepoint_count = 0;
int32_t pos_for_codepoint_count = 0, utf32pos = 0;
while (pos != USEARCH_DONE) {
u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
utf32pos += u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
pos_for_codepoint_count = pos;
length = usearch_getMatchedLength(search);
length = u_countChar32(b + pos, length);
PyObject *ret = PyObject_CallFunction(callback, "ii", pos, length);
if (ret && ret == Py_None) pos = usearch_next(search, &status);
else pos = USEARCH_DONE;
PyObject *ret = PyObject_CallFunction(callback, "ii", utf32pos, length);
if (ret == Py_None) pos = usearch_next(search, &status);
else { pos = USEARCH_DONE; if (ret == NULL) PyErr_Clear(); }
Py_CLEAR(ret);
}
} else PyErr_SetString(PyExc_ValueError, u_errorName(status));
+5 -1
View File
@@ -135,8 +135,12 @@ class TestICU(unittest.TestCase):
m = []
def a(p, l):
return m.append((p, l))
haystack = 'a𝄞ShuffleX'
icu.primary_collator_without_punctuation().find_all('shuffle', haystack, a)
self.ae(haystack[m[0][0]:m[0][0] + m[0][1]], 'Shuffle')
del m[:]
icu.primary_collator_without_punctuation().find_all('a', 'a a🐱a', a)
self.ae(m, [(0, 1), (2, 1), (5, 1)])
self.ae(m, [(0, 1), (2, 1), (4, 1)])
# test find whole words
c = icu.primary_collator_without_punctuation()
self.ae(c.find('a', 'abc a bc'), (0, 1))