mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Change O(n^2) algorithm to O(n) when splitting using the ICU break iterator
This commit is contained in:
parent
f3a877f404
commit
3cc55b16fe
@ -649,12 +649,26 @@ end:
|
||||
} // }}}
|
||||
|
||||
// BreakIterator.split2 {{{
|
||||
|
||||
static inline void
|
||||
unicode_code_point_count(UChar **count_start, int32_t *last_count, int *last_count32, int32_t *word_start, int32_t *sz) {
|
||||
int32_t chars_to_new_word_from_last_pos = *word_start - *last_count;
|
||||
int32_t sz32 = u_countChar32(*count_start + chars_to_new_word_from_last_pos, *sz);
|
||||
int32_t codepoints_to_new_word_from_last_pos = u_countChar32(*count_start, chars_to_new_word_from_last_pos);
|
||||
*count_start += chars_to_new_word_from_last_pos + *sz;
|
||||
*last_count += chars_to_new_word_from_last_pos + *sz;
|
||||
*last_count32 += codepoints_to_new_word_from_last_pos;
|
||||
*word_start = *last_count32;
|
||||
*last_count32 += sz32;
|
||||
*sz = sz32;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
||||
|
||||
int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
||||
int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0, last_count = 0, last_count32 = 0;
|
||||
int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
|
||||
UChar sep = 0;
|
||||
UChar sep = 0, *count_start = self->text;
|
||||
PyObject *ans = NULL, *temp = NULL, *t = NULL;
|
||||
|
||||
ans = PyList_New(0);
|
||||
@ -681,9 +695,8 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
|
||||
if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
|
||||
}
|
||||
last_pos = p;
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
sz = u_countChar32(self->text + word_start, sz);
|
||||
word_start = u_countChar32(self->text, word_start);
|
||||
#if defined(Py_UNICODE_WIDE) || PY_MAJOR_VERSION > 2
|
||||
unicode_code_point_count(&count_start, &last_count, &last_count32, &word_start, &sz);
|
||||
#endif
|
||||
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
|
||||
sz = last_sz + sz + trailing_hyphen;
|
||||
|
@ -162,7 +162,7 @@ class TestICU(unittest.TestCase):
|
||||
|
||||
def test_break_iterator(self):
|
||||
' Test the break iterator '
|
||||
from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
|
||||
from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions, count_words
|
||||
for q in ('one two three', ' one two three', 'one\ntwo three ', ):
|
||||
self.ae(split(unicode_type(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
||||
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
||||
@ -171,6 +171,7 @@ class TestICU(unittest.TestCase):
|
||||
self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
|
||||
self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
|
||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
|
||||
self.ae(count_words('a b c d e f'), 6)
|
||||
for needle, haystack, pos in (
|
||||
('word', 'a word b', 2),
|
||||
('word', 'a word', 2),
|
||||
|
Loading…
x
Reference in New Issue
Block a user