diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 2ff1f33d6d..16b280b854 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -649,12 +649,26 @@ end: } // }}} // BreakIterator.split2 {{{ + +static inline void +unicode_code_point_count(UChar **count_start, int32_t *last_count, int *last_count32, int32_t *word_start, int32_t *sz) { + int32_t chars_to_new_word_from_last_pos = *word_start - *last_count; + int32_t sz32 = u_countChar32(*count_start + chars_to_new_word_from_last_pos, *sz); + int32_t codepoints_to_new_word_from_last_pos = u_countChar32(*count_start, chars_to_new_word_from_last_pos); + *count_start += chars_to_new_word_from_last_pos + *sz; + *last_count += chars_to_new_word_from_last_pos + *sz; + *last_count32 += codepoints_to_new_word_from_last_pos; + *word_start = *last_count32; + *last_count32 += sz32; + *sz = sz32; +} + static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { - int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; + int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0, last_count = 0, last_count32 = 0; int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0; - UChar sep = 0; + UChar sep = 0, *count_start = self->text; PyObject *ans = NULL, *temp = NULL, *t = NULL; ans = PyList_New(0); @@ -681,9 +695,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1; } last_pos = p; -#ifdef Py_UNICODE_WIDE - sz = u_countChar32(self->text + word_start, sz); - word_start = u_countChar32(self->text, word_start); +#if defined(Py_UNICODE_WIDE) || PY_MAJOR_VERSION > 2 + unicode_code_point_count(&count_start, &last_count, &last_count32, &word_start, &sz); #endif if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) { sz = last_sz + sz + trailing_hyphen; diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 50fe699bb0..ec9169c5f3 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -162,7 +162,7 @@ class TestICU(unittest.TestCase): def test_break_iterator(self): ' Test the break iterator ' - from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions + from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions, count_words for q in ('one two three', ' one two three', 'one\ntwo three ', ): self.ae(split(unicode_type(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q) self.ae(split(u'I I\'m'), ['I', "I'm"]) @@ -171,6 +171,7 @@ class TestICU(unittest.TestCase): self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e']) self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e']) self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)]) + self.ae(count_words('a b c d e f'), 6) for needle, haystack, pos in ( ('word', 'a word b', 2), ('word', 'a word', 2),