mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit book: Spell checker: Treat hyphenated words as single words for spell checking
This commit is contained in:
parent
cdef0f3acf
commit
2f663a2e17
@ -20,8 +20,7 @@ def split_into_words(text, lang='en'):
|
|||||||
if it is None:
|
if it is None:
|
||||||
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
|
it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
|
||||||
it.set_text(text)
|
it.set_text(text)
|
||||||
return it.split()
|
return [text[p:p+s] for p, s in it.split2()]
|
||||||
|
|
||||||
|
|
||||||
def split_into_words_and_positions(text, lang='en'):
|
def split_into_words_and_positions(text, lang='en'):
|
||||||
with _lock:
|
with _lock:
|
||||||
|
@ -595,37 +595,6 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *args, PyObject *kw
|
|||||||
|
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
// BreakIterator.split {{{
|
|
||||||
static PyObject *
|
|
||||||
icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
|
|
||||||
int32_t prev = 0, p = 0, sz = 0;
|
|
||||||
PyObject *ans = NULL, *token = NULL;
|
|
||||||
|
|
||||||
ans = PyList_New(0);
|
|
||||||
if (ans == NULL) return PyErr_NoMemory();
|
|
||||||
|
|
||||||
p = ubrk_first(self->break_iterator);
|
|
||||||
while (p != UBRK_DONE) {
|
|
||||||
prev = p; p = ubrk_next(self->break_iterator);
|
|
||||||
if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
|
|
||||||
continue; // We are not at the start of a word
|
|
||||||
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
|
||||||
if (sz > 0) {
|
|
||||||
token = icu_to_python(self->text + prev, sz);
|
|
||||||
if (token == NULL) {
|
|
||||||
Py_DECREF(ans); ans = NULL; break;
|
|
||||||
}
|
|
||||||
if (PyList_Append(ans, token) != 0) {
|
|
||||||
Py_DECREF(token); Py_DECREF(ans); ans = NULL; break;
|
|
||||||
}
|
|
||||||
Py_DECREF(token);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ans;
|
|
||||||
|
|
||||||
} // }}}
|
|
||||||
|
|
||||||
// BreakIterator.index {{{
|
// BreakIterator.index {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
|
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
|
||||||
@ -673,8 +642,10 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar
|
|||||||
#error Not implemented for python >= 3.3
|
#error Not implemented for python >= 3.3
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int32_t prev = 0, p = 0, sz = 0;
|
int32_t prev = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
|
||||||
PyObject *ans = NULL, *temp = NULL;
|
int is_hyphen_sep = 0;
|
||||||
|
UChar sep = 0;
|
||||||
|
PyObject *ans = NULL, *temp = NULL, *t = NULL;
|
||||||
|
|
||||||
ans = PyList_New(0);
|
ans = PyList_New(0);
|
||||||
if (ans == NULL) return PyErr_NoMemory();
|
if (ans == NULL) return PyErr_NoMemory();
|
||||||
@ -686,18 +657,38 @@ icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwar
|
|||||||
continue; // We are not at the start of a word
|
continue; // We are not at the start of a word
|
||||||
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
|
||||||
if (sz > 0) {
|
if (sz > 0) {
|
||||||
|
// ICU breaks on words containing hyphens, we do not want that, so we recombine manually
|
||||||
|
is_hyphen_sep = 0;
|
||||||
|
if (last_pos > 0) {
|
||||||
|
if (prev - last_pos == 1) {
|
||||||
|
sep = *(self->text + last_pos);
|
||||||
|
if (sep == 0x2d || sep == 0x2010) is_hyphen_sep = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
last_pos = p;
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
sz = u_countChar32(self->text + prev, sz);
|
sz = u_countChar32(self->text + prev, sz);
|
||||||
prev = u_countChar32(self->text, prev);
|
prev = u_countChar32(self->text, prev);
|
||||||
#endif
|
#endif
|
||||||
temp = Py_BuildValue("II", prev, sz);
|
if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
|
||||||
if (temp == NULL) {
|
sz = last_sz + sz + 1;
|
||||||
Py_DECREF(ans); ans = NULL; break;
|
last_sz = sz;
|
||||||
}
|
t = PyInt_FromLong((long)sz);
|
||||||
if (PyList_Append(ans, temp) != 0) {
|
if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
|
||||||
Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
|
temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
|
||||||
|
Py_DECREF(PyTuple_GET_ITEM(temp, 1));
|
||||||
|
PyTuple_SET_ITEM(temp, 1, t);
|
||||||
|
} else {
|
||||||
|
last_sz = sz;
|
||||||
|
temp = Py_BuildValue("II", (unsigned int)prev, (unsigned int)sz);
|
||||||
|
if (temp == NULL) {
|
||||||
|
Py_DECREF(ans); ans = NULL; break;
|
||||||
|
}
|
||||||
|
if (PyList_Append(ans, temp) != 0) {
|
||||||
|
Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
|
||||||
|
}
|
||||||
|
Py_DECREF(temp);
|
||||||
}
|
}
|
||||||
Py_DECREF(temp);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -710,10 +701,6 @@ static PyMethodDef icu_BreakIterator_methods[] = {
|
|||||||
"set_text(unicode object) -> Set the text this iterator will operate on"
|
"set_text(unicode object) -> Set the text this iterator will operate on"
|
||||||
},
|
},
|
||||||
|
|
||||||
{"split", (PyCFunction)icu_BreakIterator_split, METH_VARARGS,
|
|
||||||
"split() -> Split the current text into tokens, returning a list of tokens"
|
|
||||||
},
|
|
||||||
|
|
||||||
{"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS,
|
{"split2", (PyCFunction)icu_BreakIterator_split2, METH_VARARGS,
|
||||||
"split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds."
|
"split2() -> Split the current text into tokens, returning a list of 2-tuples of the form (position of token, length of token). The numbers are suitable for indexing python strings regardless of narrow/wide builds."
|
||||||
},
|
},
|
||||||
|
@ -151,9 +151,11 @@ class TestICU(unittest.TestCase):
|
|||||||
def test_break_iterator(self):
|
def test_break_iterator(self):
|
||||||
' Test the break iterator '
|
' Test the break iterator '
|
||||||
from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
|
from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions
|
||||||
for q in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'):
|
for q in ('one two three', ' one two three', 'one\ntwo three ', ):
|
||||||
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
|
||||||
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
self.ae(split(u'I I\'m'), ['I', "I'm"])
|
||||||
|
self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
|
||||||
|
self.ae(split(u'-one two-'), ['one', 'two'])
|
||||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])
|
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6 if sys.maxunicode >= 0x10ffff else 7, 5)])
|
||||||
self.ae(0, index_of('i', 'i'))
|
self.ae(0, index_of('i', 'i'))
|
||||||
self.ae(4, index_of('i', 'six i'))
|
self.ae(4, index_of('i', 'six i'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user