diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py new file mode 100644 index 0000000000..5cca65630e --- /dev/null +++ b/src/calibre/spell/break_iterator.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2014, Kovid Goyal ' + +from threading import Lock + +from calibre.utils.icu import _icu +from calibre.utils.localization import lang_as_iso639_1 + +_iterators = {} +_lock = Lock() + +has_break_iterator = hasattr(_icu, 'BreakIterator') + +def split_into_words(text, lang='en'): + with _lock: + it = _iterators.get(lang, None) + if it is None: + it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang) + it.set_text(text) + return it.split() + +def index_of(needle, haystack, lang='en'): + with _lock: + it = _iterators.get(lang, None) + if it is None: + it = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang) + it.set_text(haystack) + return it.index(needle) + diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 22c9bbb811..29dc91e1e9 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -528,6 +528,202 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs) // }}} +// BreakIterator object definition {{{ +typedef struct { + PyObject_HEAD + // Type-specific fields go here. + UBreakIterator *break_iterator; + UChar *text; + int32_t text_len; + UBreakIteratorType type; + +} icu_BreakIterator; + +static void +icu_BreakIterator_dealloc(icu_BreakIterator* self) +{ + if (self->break_iterator != NULL) ubrk_close(self->break_iterator); + if (self->text != NULL) free(self->text); + self->break_iterator = NULL; self->text = NULL; + self->ob_type->tp_free((PyObject*)self); +} + + +static PyObject * +icu_BreakIterator_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + icu_BreakIterator *self = NULL; + const char *locale = NULL; + int break_iterator_type = UBRK_WORD; + UErrorCode status = U_ZERO_ERROR; + UBreakIterator *break_iterator; + + if (!PyArg_ParseTuple(args, "is", &break_iterator_type, &locale)) return NULL; + break_iterator = ubrk_open(break_iterator_type, locale, NULL, 0, &status); + if (break_iterator == NULL || U_FAILURE(status)) { + PyErr_SetString(PyExc_ValueError, u_errorName(status)); + return NULL; + } + + self = (icu_BreakIterator *)type->tp_alloc(type, 0); + if (self != NULL) { + self->break_iterator = break_iterator; + } + self->text = NULL; self->text_len = 0; self->type = break_iterator_type; + + return (PyObject *)self; +} + +// BreakIterator.set_text {{{ +static PyObject * +icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { + int32_t sz = 0; + UChar *buf = NULL; + UErrorCode status = U_ZERO_ERROR; + PyObject *input = NULL; + + if (!PyArg_ParseTuple(args, "O", &input)) return NULL; + buf = python_to_icu(input, &sz, 1); + if (buf == NULL) return NULL; + ubrk_setText(self->break_iterator, buf, sz, &status); + if (U_FAILURE(status)) { + free(buf); buf = NULL; + PyErr_SetString(PyExc_ValueError, u_errorName(status)); + } else { self->text = buf; self->text_len = sz; } + + Py_RETURN_NONE; + +} // }}} + +// BreakIterator.split {{{ +static PyObject * +icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { + int32_t prev = 0, p = 0, sz = 0; + PyObject *ans = NULL, *token = NULL; + + ans = PyList_New(0); + if (ans == NULL) return PyErr_NoMemory(); + + p = ubrk_first(self->break_iterator); + while (p != UBRK_DONE) { + prev = p; p = ubrk_next(self->break_iterator); + if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) + continue; // We are not at the start of a word + sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; + if (sz > 0) { + token = icu_to_python(self->text + prev, sz); + if (token == NULL) { + Py_DECREF(ans); ans = NULL; break; + } + if (PyList_Append(ans, token) != 0) { + Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; + } + Py_DECREF(token); + } + } + + return ans; + +} // }}} + +// BreakIterator.index {{{ +static PyObject * +icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { +#if PY_VERSION_HEX >= 0x03030000 +#error Not implemented for python >= 3.3 +#endif + + UChar *buf = NULL; + int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1; + PyObject *token = NULL; + + if (!PyArg_ParseTuple(args, "O", &token)) return NULL; + buf = python_to_icu(token, &sz, 1); + if (buf == NULL) return NULL; + if (sz < 1) goto end; + + p = ubrk_first(self->break_iterator); + while (p != UBRK_DONE) { + prev = p; p = ubrk_next(self->break_iterator); + if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) + continue; // We are not at the start of a word + tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; + if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { +#ifdef PY_UNICODE_WIDE + ans = u_countChar32(self->text, prev); +#else + ans = prev; +#endif + break; + } + } + +end: + free(buf); + return Py_BuildValue("i", ans); + +} // }}} + +static PyMethodDef icu_BreakIterator_methods[] = { + {"set_text", (PyCFunction)icu_BreakIterator_set_text, METH_VARARGS, + "set_text(unicode object) -> Set the text this iterator will operate on" + }, + + {"split", (PyCFunction)icu_BreakIterator_split, METH_VARARGS, + "split() -> Split the current text into tokens, returning a list of tokens" + }, + + {"index", (PyCFunction)icu_BreakIterator_index, METH_VARARGS, + "index(token) -> Find the index of the first match for token. Useful to find, for example, words that could also be a part of a larger word. For example, index('i') in 'string i' will be 7 not 3. Returns -1 if not found." + }, + + {NULL} /* Sentinel */ +}; + + +static PyTypeObject icu_BreakIteratorType = { // {{{ + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "icu.BreakIterator", /*tp_name*/ + sizeof(icu_BreakIterator), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)icu_BreakIterator_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "Break Iterator", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + icu_BreakIterator_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + icu_BreakIterator_new, /* tp_new */ +}; // }}} + +// }}} // change_case {{{ @@ -755,6 +951,28 @@ icu_roundtrip(PyObject *self, PyObject *args) { return ret; } // }}} +// available_locales_for_break_iterator {{{ +static PyObject * +icu_break_iterator_locales(PyObject *self, PyObject *args) { + int32_t count = 0, i = 0; + const char *loc = NULL; + PyObject *ret = NULL, *t = NULL; + + count = ubrk_countAvailable(); + ret = PyTuple_New(count); + if (ret != NULL) { + for (i = 0; i < count; i++) { + loc = ubrk_getAvailable(i); + if (!loc) loc = ""; + t = PyBytes_FromString(loc); + if (t == NULL) { Py_DECREF(ret); ret = NULL; PyErr_NoMemory(); break; } + PyTuple_SET_ITEM(ret, i, t); + } + } + + return ret; +} // }}} + // Module initialization {{{ static PyMethodDef icu_methods[] = { {"change_case", icu_change_case, METH_VARARGS, @@ -793,6 +1011,9 @@ static PyMethodDef icu_methods[] = { "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)" }, + {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_VARARGS, + "available_locales_for_break_iterator() -> Return tuple of all available locales for the BreakIterator" + }, {NULL} /* Sentinel */ }; @@ -824,12 +1045,15 @@ initicu(void) if (PyType_Ready(&icu_CollatorType) < 0) return; + if (PyType_Ready(&icu_BreakIteratorType) < 0) + return; m = Py_InitModule3("icu", icu_methods, "Wrapper for the ICU internationalization library"); - Py_INCREF(&icu_CollatorType); + Py_INCREF(&icu_CollatorType); Py_INCREF(&icu_BreakIteratorType); PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType); + PyModule_AddObject(m, "BreakIterator", (PyObject *)&icu_BreakIteratorType); // uint8_t must be the same size as char PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0); PyModule_AddStringConstant(m, "icu_version", version); @@ -864,5 +1088,10 @@ initicu(void) ADDUCONST(LOWER_CASE); ADDUCONST(TITLE_CASE); + ADDUCONST(UBRK_CHARACTER); + ADDUCONST(UBRK_WORD); + ADDUCONST(UBRK_LINE); + ADDUCONST(UBRK_SENTENCE); + } // }}} diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h index a965d0c072..9dc8cd2bfc 100644 --- a/src/calibre/utils/icu_calibre_utils.h +++ b/src/calibre/utils/icu_calibre_utils.h @@ -20,6 +20,7 @@ #include #include #include +#include #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index a80851cdf1..4bbb264d5a 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -129,18 +129,34 @@ class TestICU(unittest.TestCase): {' ':[''], 'A':['A1', 'a1'], '\U0001f431':['\U0001f431', '\U0001f431x']}) def test_roundtrip(self): + ' Test roundtripping ' for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): self.ae(r, icu._icu.roundtrip(r)) def test_character_name(self): + ' Test character naming ' self.ae(icu.character_name('\U0001f431'), 'CAT FACE') def test_contractions(self): + ' Test contractions ' c = icu._icu.Collator('cs') self.ae(icu.contractions(c), frozenset({u'Z\u030c', u'z\u030c', u'Ch', u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH', u'S\u030c', u'R\u030c'})) + def test_break_iterator(self): + ' Test the break iterator ' + from calibre.spell.break_iterator import split_into_words as split, index_of + for q in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'): + self.ae(split(unicode(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q) + self.ae(split(u'I I\'m'), ['I', "I'm"]) + self.ae(0, index_of('i', 'i')) + self.ae(4, index_of('i', 'six i')) + self.ae(-1, index_of('i', '')) + self.ae(-1, index_of('', '')) + self.ae(-1, index_of('', 'i')) + self.ae(-1, index_of('i', 'six clicks')) + class TestRunner(unittest.main): def createTests(self):