mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-05-30 18:45:20 -04:00
E-book viewer: Fix incorrect search match offsets in normal search mode when the text contains non-BMP Unicode characters. Fixes #2152227 [search ignores some chars](https://bugs.launchpad.net/calibre/+bug/2152227)
This commit is contained in:
@@ -298,15 +298,15 @@ icu_Collator_find_all(icu_Collator *self, PyObject *const *args, Py_ssize_t narg
|
||||
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, whole_words ? self->word_iterator : NULL, &status);
|
||||
if (search && U_SUCCESS(status)) {
|
||||
pos = usearch_first(search, &status);
|
||||
int32_t pos_for_codepoint_count = 0;
|
||||
int32_t pos_for_codepoint_count = 0, utf32pos = 0;
|
||||
while (pos != USEARCH_DONE) {
|
||||
u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
|
||||
utf32pos += u_countChar32(b + pos_for_codepoint_count, pos - pos_for_codepoint_count);
|
||||
pos_for_codepoint_count = pos;
|
||||
length = usearch_getMatchedLength(search);
|
||||
length = u_countChar32(b + pos, length);
|
||||
PyObject *ret = PyObject_CallFunction(callback, "ii", pos, length);
|
||||
if (ret && ret == Py_None) pos = usearch_next(search, &status);
|
||||
else pos = USEARCH_DONE;
|
||||
PyObject *ret = PyObject_CallFunction(callback, "ii", utf32pos, length);
|
||||
if (ret == Py_None) pos = usearch_next(search, &status);
|
||||
else { pos = USEARCH_DONE; if (ret == NULL) PyErr_Clear(); }
|
||||
Py_CLEAR(ret);
|
||||
}
|
||||
} else PyErr_SetString(PyExc_ValueError, u_errorName(status));
|
||||
|
||||
@@ -135,8 +135,12 @@ class TestICU(unittest.TestCase):
|
||||
m = []
|
||||
def a(p, l):
|
||||
return m.append((p, l))
|
||||
haystack = 'a𝄞ShuffleX'
|
||||
icu.primary_collator_without_punctuation().find_all('shuffle', haystack, a)
|
||||
self.ae(haystack[m[0][0]:m[0][0] + m[0][1]], 'Shuffle')
|
||||
del m[:]
|
||||
icu.primary_collator_without_punctuation().find_all('a', 'a a🐱a', a)
|
||||
self.ae(m, [(0, 1), (2, 1), (5, 1)])
|
||||
self.ae(m, [(0, 1), (2, 1), (4, 1)])
|
||||
# test find whole words
|
||||
c = icu.primary_collator_without_punctuation()
|
||||
self.ae(c.find('a', 'abc a bc'), (0, 1))
|
||||
|
||||
Reference in New Issue
Block a user