From fa9b43f7f13db99d67605cdaf7a45255474e8b06 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 12 May 2014 19:11:50 +0530
Subject: [PATCH] Add a break iterator that returns indices into the string instead of tokens in the string

---
 src/calibre/spell/break_iterator.py | 10 +++++++
 src/calibre/utils/icu.c             | 43 +++++++++++++++++++++++++++++
 src/calibre/utils/icu_test.py       |  3 +-
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py
index 16f4da4110..8480a73569 100644
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -22,6 +22,16 @@ def split_into_words(text, lang='en'):
         it.set_text(text)
         return it.split()
 
+
+def split_into_words_and_positions(text, lang='en'):  # same setup as split_into_words(), but returns the result of BreakIterator.split2()
+    with _lock:  # the cached iterator is only used while _lock is held
+        it = _iterators.get(lang, None)  # reuse the iterator already cached for this lang, if any
+        if it is None:
+            it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)  # first use for this lang: create a word iterator and cache it
+        it.set_text(text)
+        return it.split2()  # list of (position, length) 2-tuples, per the split2 doc string in icu.c
+
+
 def index_of(needle, haystack, lang='en'):
     with _lock:
         it = _iterators.get(lang, None)
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index ec46aeb0d8..f2506fd78b 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -666,6 +666,45 @@ end:
 
 } // }}}
 
+// BreakIterator.split2 {{{
+static PyObject *
+icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {  // returns a new list of (position, length) tuples, or NULL on failure
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+
+    int32_t prev = 0, p = 0, sz = 0;  // prev, p: previous and current boundary offsets; sz: length of the segment between them
+    PyObject *ans = NULL, *temp = NULL;
+
+    ans = PyList_New(0);
+    if (ans == NULL) return PyErr_NoMemory();
+
+    p = ubrk_first(self->break_iterator);
+    while (p != UBRK_DONE) {
+        prev = p; p = ubrk_next(self->break_iterator);  // advance: the candidate segment is [prev, p)
+        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
+            continue;  // We are not at the start of a word
+        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;  // once iteration is done, measure up to the end of the text instead
+        if (sz > 0) {
+#ifdef Py_UNICODE_WIDE
+            sz = u_countChar32(self->text + prev, sz);  // wide build: express the length as a count of code points
+            prev = u_countChar32(self->text, prev);  // likewise for the offset; prev is reassigned from p at the top of the loop
+#endif
+            temp = Py_BuildValue("II", prev, sz);
+            if (temp == NULL) {
+                Py_DECREF(ans); ans = NULL; break;  // drop the partial list; the NULL is returned after the loop
+            }
+            if (PyList_Append(ans, temp) != 0) {
+                Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;  // release both the tuple and the partial list on failure
+            }
+            Py_DECREF(temp);  // PyList_Append does not steal the reference, so release ours
+        }
+    }
+
+    return ans;
+
+} // }}}
+
 static PyMethodDef icu_BreakIterator_methods[] = {
     {"set_text", (PyCFunction)icu_BreakIterator_set_text, METH_VARARGS,
      "set_text(unicode object) -> Set the text this iterator will operate on"
@@ -675,6 +714,10 @@ static PyMethodDef icu_BreakIterator_methods[] = {
      "split() -> Split the current text into tokens, returning a list of tokens"
     },
 
+    {"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS,
+     "split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds."
+    },
+
+
     {"index", (PyCFunction)icu_BreakIterator_index, METH_VARARGS,
      "index(token) -> Find the index of the first match for token. Useful to find, for example, words that could also be a part of a larger word. For example, index('i') in 'string i' will be 7 not 3. Returns -1 if not found."
     },
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 83cce844df..8d42c59efa 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -150,10 +150,11 @@ class TestICU(unittest.TestCase):
 
     def test_break_iterator(self):
         ' Test the break iterator '
-        from calibre.spell.break_iterator import split_into_words as split, index_of
+        from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
         for q in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'):
             self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
         self.ae(split(u'I I\'m'), ['I', "I'm"])
+        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])  # 'three' starts one index later on narrow builds, where U+1F431 occupies two code units
         self.ae(0, index_of('i', 'i'))
         self.ae(4, index_of('i', 'six i'))
         self.ae(-1, index_of('i', ''))