mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a break iterator that returns indices into the string instead of tokens in the string
This commit is contained in:
parent
c0b0df3160
commit
fa9b43f7f1
@ -22,6 +22,16 @@ def split_into_words(text, lang='en'):
|
||||
it.set_text(text)
|
||||
return it.split()
|
||||
|
||||
|
||||
def split_into_words_and_positions(text, lang='en'):
    """Split *text* into words, reporting where each word is instead of the word itself.

    The work is delegated to ``BreakIterator.split2()``, which is documented
    as returning a list of ``(position of token, length of token)`` 2-tuples
    usable for indexing python strings.

    One word break iterator is kept per *lang* in ``_iterators`` and reused
    on later calls; creating, loading and running it all happen while
    ``_lock`` is held.
    """
    with _lock:
        breaker = _iterators.get(lang)
        if breaker is None:
            # Use lang unchanged when lang_as_iso639_1() returns nothing usable
            breaker = _icu.BreakIterator(
                _icu.UBRK_WORD,
                lang_as_iso639_1(lang) or lang,
            )
            _iterators[lang] = breaker
        breaker.set_text(text)
        positions = breaker.split2()
    return positions
|
||||
|
||||
|
||||
def index_of(needle, haystack, lang='en'):
|
||||
with _lock:
|
||||
it = _iterators.get(lang, None)
|
||||
|
@ -666,6 +666,45 @@ end:
|
||||
|
||||
} // }}}
|
||||
|
||||
// BreakIterator.split2 {{{
|
||||
static PyObject *
|
||||
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
|
||||
#if PY_VERSION_HEX >= 0x03030000
|
||||
#error Not implemented for python >= 3.3
|
||||
#endif
|
||||
|
||||
int32_t prev = 0, p = 0, sz = 0;
|
||||
PyObject *ans = NULL, *temp = NULL;
|
||||
|
||||
ans = PyList_New(0);
|
||||
if (ans == NULL) return PyErr_NoMemory();
|
||||
|
||||
p = ubrk_first(self->break_iterator);
|
||||
while (p != UBRK_DONE) {
|
||||
prev = p; p = ubrk_next(self->break_iterator);
|
||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
||||
continue; // We are not at the start of a word
|
||||
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
||||
if (sz > 0) {
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
sz = u_countChar32(self->text + prev, sz);
|
||||
prev = u_countChar32(self->text, prev);
|
||||
#endif
|
||||
temp = Py_BuildValue("II", prev, sz);
|
||||
if (temp == NULL) {
|
||||
Py_DECREF(ans); ans = NULL; break;
|
||||
}
|
||||
if (PyList_Append(ans, temp) != 0) {
|
||||
Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
|
||||
}
|
||||
Py_DECREF(temp);
|
||||
}
|
||||
}
|
||||
|
||||
return ans;
|
||||
|
||||
} // }}}
|
||||
|
||||
static PyMethodDef icu_BreakIterator_methods[] = {
|
||||
{"set_text", (PyCFunction)icu_BreakIterator_set_text, METH_VARARGS,
|
||||
"set_text(unicode object) -> Set the text this iterator will operate on"
|
||||
@ -675,6 +714,10 @@ static PyMethodDef icu_BreakIterator_methods[] = {
|
||||
"split() -> Split the current text into tokens, returning a list of tokens"
|
||||
},
|
||||
|
||||
{"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS,
|
||||
"split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds."
|
||||
},
|
||||
|
||||
{"index", (PyCFunction)icu_BreakIterator_index, METH_VARARGS,
|
||||
"index(token) -> Find the index of the first match for token. Useful to find, for example, words that could also be a part of a larger word. For example, index('i') in 'string i' will be 7 not 3. Returns -1 if not found."
|
||||
},
|
||||
|
@ -150,10 +150,11 @@ class TestICU(unittest.TestCase):
|
||||
|
||||
def test_break_iterator(self):
|
||||
' Test the break iterator '
|
||||
from calibre.spell.break_iterator import split_into_words as split, index_of
|
||||
from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
|
||||
for q in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'):
|
||||
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
||||
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])
|
||||
self.ae(0, index_of('i', 'i'))
|
||||
self.ae(4, index_of('i', 'six i'))
|
||||
self.ae(-1, index_of('i', ''))
|
||||
|
Loading…
x
Reference in New Issue
Block a user