diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 5ae1665578..9ce1a15bdc 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -603,7 +603,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg #endif UChar *buf = NULL; - int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1; + int32_t prev = 0, p = 0, sz = 0, ans = -1; PyObject *token = NULL; if (!PyArg_ParseTuple(args, "O", &token)) return NULL; @@ -617,21 +617,26 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwarg prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word - tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; - if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { -#ifdef PY_UNICODE_WIDE - ans = u_countChar32(self->text, prev); -#else - ans = prev; -#endif - break; + if (self->text_len >= prev + sz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { + // Needle is present at text[prev:] we have to check if it is followed by a non-hyphen boundary + if( + ubrk_isBoundary(self->break_iterator, prev + sz) && + (self->text_len == prev + sz || (self->text[prev + sz] != 0x2d && self->text[prev + sz] != 0x2010)) + ) { + ans = prev; break; // Found word surrounded by non-hyphen boundaries + } + if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary } } +#ifdef Py_UNICODE_WIDE + if (ans > 0) ans = u_countChar32(self->text, ans); +#endif Py_END_ALLOW_THREADS; + end: free(buf); - return Py_BuildValue("i", ans); + return Py_BuildValue("l", (long int)ans); } // }}} diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index faf7e3834a..4f7a474dba 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -156,13 +156,24 @@ class TestICU(unittest.TestCase): self.ae(split(u'I I\'m'), ['I', "I'm"]) self.ae(split(u'out-of-the-box'), ['out-of-the-box']) self.ae(split(u'-one two-'), ['one', 'two']) - self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)]) - self.ae(0, index_of('i', 'i')) - self.ae(4, index_of('i', 'six i')) - self.ae(-1, index_of('i', '')) - self.ae(-1, index_of('', '')) - self.ae(-1, index_of('', 'i')) - self.ae(-1, index_of('i', 'six clicks')) + self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)]) + for needle, haystack, pos in ( + ('word', 'a word b', 2), + ('word', 'a word', 2), + ('one-two', 'a one-two punch', 2), + ('one-two', 'one-two punch', 0), + ('one-two', 'one-two', 0), + ('one', 'one-two one', 8), + ('one-two', 'one-two-three one-two', 14), + ('one', 'onet one', 5), + ('i', 'i', 0), + ('i', 'six i', 4), + ('i', '', -1), ('', '', -1), ('', 'i', -1), + ('i', 'six clicks', -1), + ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)), + ): + fpos = index_of(needle, haystack) + self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos)) class TestRunner(unittest.main):