From 2f663a2e1746cde6c99982eca34d019425b20677 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 8 Jul 2014 11:45:45 +0530 Subject: [PATCH] Edit book: Spell checker: Treat hyphenated words as single words for spell checking --- src/calibre/spell/break_iterator.py | 3 +- src/calibre/utils/icu.c | 75 ++++++++++++----------------- src/calibre/utils/icu_test.py | 4 +- 3 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py index 8480a73569..a615a5fced 100644 --- a/src/calibre/spell/break_iterator.py +++ b/src/calibre/spell/break_iterator.py @@ -20,8 +20,7 @@ def split_into_words(text, lang='en'): if it is None: it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang) it.set_text(text) - return it.split() - + return [text[p:p+s] for p, s in it.split2()] def split_into_words_and_positions(text, lang='en'): with _lock: diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index f2506fd78b..5ae1665578 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -595,37 +595,6 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *args, PyObject *kw } // }}} -// BreakIterator.split {{{ -static PyObject * -icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { - int32_t prev = 0, p = 0, sz = 0; - PyObject *ans = NULL, *token = NULL; - - ans = PyList_New(0); - if (ans == NULL) return PyErr_NoMemory(); - - p = ubrk_first(self->break_iterator); - while (p != UBRK_DONE) { - prev = p; p = ubrk_next(self->break_iterator); - if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) - continue; // We are not at the start of a word - sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; - if (sz > 0) { - token = icu_to_python(self->text + prev, sz); - if (token == NULL) { - Py_DECREF(ans); ans = NULL; break; - } - if (PyList_Append(ans, token) != 0) { - Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; - } - Py_DECREF(token); - } - } - - return ans; - -} // }}} - // BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { @@ -673,8 +642,10 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar #error Not implemented for python >= 3.3 #endif - int32_t prev = 0, p = 0, sz = 0; - PyObject *ans = NULL, *temp = NULL; + int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; + int is_hyphen_sep = 0; + UChar sep = 0; + PyObject *ans = NULL, *temp = NULL, *t = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); @@ -686,18 +657,38 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { + // ICU breaks on words containing hyphens, we do not want that, so we recombine manually + is_hyphen_sep = 0; + if (last_pos > 0) { + if (prev - last_pos == 1) { + sep = *(self->text + last_pos); + if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1; + } + } + last_pos = p; #ifdef Py_UNICODE_WIDE sz = u_countChar32(self->text + prev, sz); prev = u_countChar32(self->text, prev); #endif - temp = Py_BuildValue("II", prev, sz); - if (temp == NULL) { - Py_DECREF(ans); ans = NULL; break; - } - if (PyList_Append(ans, temp) != 0) { - Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; + if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) { + sz = last_sz + sz + 1; + last_sz = sz; + t = PyInt_FromLong((long)sz); + if (t == NULL) { Py_DECREF(ans); ans = NULL; break; } + temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1); + Py_DECREF(PyTuple_GET_ITEM(temp, 1)); + PyTuple_SET_ITEM(temp, 1, t); + } else { + last_sz = sz; + temp = Py_BuildValue("II", (unsigned int)prev, (unsigned int)sz); + if (temp == NULL) { + Py_DECREF(ans); ans = NULL; break; + } + if (PyList_Append(ans, temp) != 0) { + Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; + } + Py_DECREF(temp); } - Py_DECREF(temp); } } @@ -710,10 +701,6 @@ static PyMethodDef icu_BreakIterator_methods[] = { "set_text(unicode object) -> Set the text this iterator will operate on" }, - {"split", (PyCFunction)icu_BreakIterator_split, METH_VARARGS, - "split() -> Split the current text into tokens, returning a list of tokens" - }, - {"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS, "split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds." }, diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 8d42c59efa..faf7e3834a 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -151,9 +151,11 @@ class TestICU(unittest.TestCase): def test_break_iterator(self): ' Test the break iterator ' from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions - for q in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'): + for q in ('one two three', ' one two three', 'one\ntwo three ', ): self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q) self.ae(split(u'I I\'m'), ['I', "I'm"]) + self.ae(split(u'out-of-the-box'), ['out-of-the-box']) + self.ae(split(u'-one two-'), ['one', 'two']) self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)]) self.ae(0, index_of('i', 'i')) self.ae(4, index_of('i', 'six i'))