mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Wrap the ICU BreakIterator API
This commit is contained in:
		
							parent
							
								
									8e4478cc6f
								
							
						
					
					
						commit
						0d4c179c4b
					
				
							
								
								
									
										34
									
								
								src/calibre/spell/break_iterator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								src/calibre/spell/break_iterator.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,34 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env python
 | 
				
			||||||
 | 
					# vim:fileencoding=utf-8
 | 
				
			||||||
 | 
					from __future__ import (unicode_literals, division, absolute_import,
 | 
				
			||||||
 | 
					                        print_function)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__license__ = 'GPL v3'
 | 
				
			||||||
 | 
					__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from threading import Lock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from calibre.utils.icu import _icu
 | 
				
			||||||
 | 
					from calibre.utils.localization import lang_as_iso639_1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Serializes every use of the cached iterators below: each one is a single
					# shared object whose text is replaced on every call.
					_lock = Lock()
					# Cache of ICU word BreakIterator objects, keyed by the lang string given
					# by the caller.
					_iterators = {}
					
					# True when the compiled _icu module provides the BreakIterator type
					has_break_iterator = hasattr(_icu, 'BreakIterator')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def split_into_words(text, lang='en'):
					    ''' Return the list of words found in text by the ICU word
					    BreakIterator for lang. The iterator for each lang is created on first
					    use and cached in _iterators; the whole operation runs under _lock. '''
					    with _lock:
					        try:
					            breaker = _iterators[lang]
					        except KeyError:
					            # Prefer the ISO 639-1 form of the language code, falling back
					            # to the code as given
					            locale = lang_as_iso639_1(lang) or lang
					            breaker = _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, locale)
					        breaker.set_text(text)
					        words = breaker.split()
					    return words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def index_of(needle, haystack, lang='en'):
					    ''' Return the index in haystack of the first whole word equal to
					    needle, or -1 if there is none. Word boundaries come from the ICU word
					    BreakIterator for lang, which is created on first use and cached in
					    _iterators; the whole operation runs under _lock. '''
					    with _lock:
					        if lang not in _iterators:
					            _iterators[lang] = _icu.BreakIterator(_icu.UBRK_WORD, lang_as_iso639_1(lang) or lang)
					        finder = _iterators[lang]
					        finder.set_text(haystack)
					        return finder.index(needle)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -528,6 +528,202 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
// }}}
 | 
					// }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// BreakIterator object definition {{{
 | 
				
			||||||
 | 
					typedef struct {
 | 
				
			||||||
 | 
					    PyObject_HEAD
 | 
				
			||||||
 | 
					    // Type-specific fields go here.
 | 
				
			||||||
 | 
					    UBreakIterator *break_iterator;
 | 
				
			||||||
 | 
					    UChar *text;
 | 
				
			||||||
 | 
					    int32_t text_len;
 | 
				
			||||||
 | 
					    UBreakIteratorType type;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} icu_BreakIterator;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void
 | 
				
			||||||
 | 
					icu_BreakIterator_dealloc(icu_BreakIterator* self)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    if (self->break_iterator != NULL) ubrk_close(self->break_iterator);
 | 
				
			||||||
 | 
					    if (self->text != NULL) free(self->text);
 | 
				
			||||||
 | 
					    self->break_iterator = NULL; self->text = NULL;
 | 
				
			||||||
 | 
					    self->ob_type->tp_free((PyObject*)self);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static PyObject *
 | 
				
			||||||
 | 
					icu_BreakIterator_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    icu_BreakIterator *self = NULL;
 | 
				
			||||||
 | 
					    const char *locale = NULL;
 | 
				
			||||||
 | 
					    int break_iterator_type = UBRK_WORD;
 | 
				
			||||||
 | 
					    UErrorCode status = U_ZERO_ERROR;
 | 
				
			||||||
 | 
					    UBreakIterator *break_iterator;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (!PyArg_ParseTuple(args, "is", &break_iterator_type, &locale)) return NULL;
 | 
				
			||||||
 | 
					    break_iterator = ubrk_open(break_iterator_type, locale, NULL, 0, &status);
 | 
				
			||||||
 | 
					    if (break_iterator == NULL || U_FAILURE(status)) { 
 | 
				
			||||||
 | 
					        PyErr_SetString(PyExc_ValueError, u_errorName(status));
 | 
				
			||||||
 | 
					        return NULL;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    self = (icu_BreakIterator *)type->tp_alloc(type, 0);
 | 
				
			||||||
 | 
					    if (self != NULL) {
 | 
				
			||||||
 | 
					        self->break_iterator = break_iterator;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    self->text = NULL; self->text_len = 0; self->type = break_iterator_type;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return (PyObject *)self;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// BreakIterator.set_text {{{
 | 
				
			||||||
 | 
					static PyObject *
 | 
				
			||||||
 | 
					icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
 | 
				
			||||||
 | 
					    int32_t sz = 0;
 | 
				
			||||||
 | 
					    UChar *buf = NULL;
 | 
				
			||||||
 | 
					    UErrorCode status = U_ZERO_ERROR;
 | 
				
			||||||
 | 
					    PyObject *input = NULL;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    if (!PyArg_ParseTuple(args, "O", &input)) return NULL;
 | 
				
			||||||
 | 
					    buf = python_to_icu(input, &sz, 1);
 | 
				
			||||||
 | 
					    if (buf == NULL) return NULL;
 | 
				
			||||||
 | 
					    ubrk_setText(self->break_iterator, buf, sz, &status);
 | 
				
			||||||
 | 
					    if (U_FAILURE(status)) {
 | 
				
			||||||
 | 
					        free(buf); buf = NULL;
 | 
				
			||||||
 | 
					        PyErr_SetString(PyExc_ValueError, u_errorName(status));
 | 
				
			||||||
 | 
					    } else { self->text = buf; self->text_len = sz; }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Py_RETURN_NONE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// BreakIterator.split {{{
 | 
				
			||||||
 | 
					static PyObject *
 | 
				
			||||||
 | 
					icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
 | 
				
			||||||
 | 
					    int32_t prev = 0, p = 0, sz = 0;
 | 
				
			||||||
 | 
					    PyObject *ans = NULL, *token = NULL;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    ans = PyList_New(0);
 | 
				
			||||||
 | 
					    if (ans == NULL) return PyErr_NoMemory();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    p = ubrk_first(self->break_iterator);
 | 
				
			||||||
 | 
					    while (p != UBRK_DONE) {
 | 
				
			||||||
 | 
					        prev = p; p = ubrk_next(self->break_iterator);
 | 
				
			||||||
 | 
					        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
 | 
				
			||||||
 | 
					            continue;  // We are not at the start of a word
 | 
				
			||||||
 | 
					        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
 | 
				
			||||||
 | 
					        if (sz > 0) {
 | 
				
			||||||
 | 
					            token = icu_to_python(self->text + prev, sz);
 | 
				
			||||||
 | 
					            if (token == NULL) {
 | 
				
			||||||
 | 
					                Py_DECREF(ans); ans = NULL; break; 
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            if (PyList_Append(ans, token) != 0) {
 | 
				
			||||||
 | 
					                Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; 
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            Py_DECREF(token);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return ans;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// BreakIterator.index {{{
 | 
				
			||||||
 | 
					static PyObject *
 | 
				
			||||||
 | 
					icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
 | 
				
			||||||
 | 
					#if PY_VERSION_HEX >= 0x03030000 
 | 
				
			||||||
 | 
					#error Not implemented for python >= 3.3
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    UChar *buf = NULL;
 | 
				
			||||||
 | 
					    int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1;
 | 
				
			||||||
 | 
					    PyObject *token = NULL;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    if (!PyArg_ParseTuple(args, "O", &token)) return NULL;
 | 
				
			||||||
 | 
					    buf = python_to_icu(token, &sz, 1);
 | 
				
			||||||
 | 
					    if (buf == NULL) return NULL;
 | 
				
			||||||
 | 
					    if (sz < 1) goto end;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    p = ubrk_first(self->break_iterator);
 | 
				
			||||||
 | 
					    while (p != UBRK_DONE) {
 | 
				
			||||||
 | 
					        prev = p; p = ubrk_next(self->break_iterator);
 | 
				
			||||||
 | 
					        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
 | 
				
			||||||
 | 
					            continue;  // We are not at the start of a word
 | 
				
			||||||
 | 
					        tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
 | 
				
			||||||
 | 
					        if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { 
 | 
				
			||||||
 | 
					#ifdef PY_UNICODE_WIDE
 | 
				
			||||||
 | 
					            ans = u_countChar32(self->text, prev);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					            ans = prev; 
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					            break;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					end:
 | 
				
			||||||
 | 
					    free(buf);
 | 
				
			||||||
 | 
					    return Py_BuildValue("i", ans);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static PyMethodDef icu_BreakIterator_methods[] = {
 | 
				
			||||||
 | 
					    {"set_text", (PyCFunction)icu_BreakIterator_set_text, METH_VARARGS,
 | 
				
			||||||
 | 
					     "set_text(unicode object) -> Set the text this iterator will operate on"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    {"split", (PyCFunction)icu_BreakIterator_split, METH_VARARGS,
 | 
				
			||||||
 | 
					     "split() -> Split the current text into tokens, returning a list of tokens"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    {"index", (PyCFunction)icu_BreakIterator_index, METH_VARARGS,
 | 
				
			||||||
 | 
					     "index(token) -> Find the index of the first match for token. Useful to find, for example, words that could also be a part of a larger word. For example, index('i') in 'string i' will be 7 not 3. Returns -1 if not found."
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    {NULL}  /* Sentinel */
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static PyTypeObject icu_BreakIteratorType = { // {{{
 | 
				
			||||||
 | 
					    PyObject_HEAD_INIT(NULL)
 | 
				
			||||||
 | 
					    0,                         /*ob_size*/
 | 
				
			||||||
 | 
					    "icu.BreakIterator",            /*tp_name*/
 | 
				
			||||||
 | 
					    sizeof(icu_BreakIterator),      /*tp_basicsize*/
 | 
				
			||||||
 | 
					    0,                         /*tp_itemsize*/
 | 
				
			||||||
 | 
					    (destructor)icu_BreakIterator_dealloc, /*tp_dealloc*/
 | 
				
			||||||
 | 
					    0,                         /*tp_print*/
 | 
				
			||||||
 | 
					    0,                         /*tp_getattr*/
 | 
				
			||||||
 | 
					    0,                         /*tp_setattr*/
 | 
				
			||||||
 | 
					    0,                         /*tp_compare*/
 | 
				
			||||||
 | 
					    0,                         /*tp_repr*/
 | 
				
			||||||
 | 
					    0,                         /*tp_as_number*/
 | 
				
			||||||
 | 
					    0,                         /*tp_as_sequence*/
 | 
				
			||||||
 | 
					    0,                         /*tp_as_mapping*/
 | 
				
			||||||
 | 
					    0,                         /*tp_hash */
 | 
				
			||||||
 | 
					    0,                         /*tp_call*/
 | 
				
			||||||
 | 
					    0,                         /*tp_str*/
 | 
				
			||||||
 | 
					    0,                         /*tp_getattro*/
 | 
				
			||||||
 | 
					    0,                         /*tp_setattro*/
 | 
				
			||||||
 | 
					    0,                         /*tp_as_buffer*/
 | 
				
			||||||
 | 
					    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,        /*tp_flags*/
 | 
				
			||||||
 | 
					    "Break Iterator",                  /* tp_doc */
 | 
				
			||||||
 | 
					    0,		               /* tp_traverse */
 | 
				
			||||||
 | 
					    0,		               /* tp_clear */
 | 
				
			||||||
 | 
					    0,		               /* tp_richcompare */
 | 
				
			||||||
 | 
					    0,		               /* tp_weaklistoffset */
 | 
				
			||||||
 | 
					    0,		               /* tp_iter */
 | 
				
			||||||
 | 
					    0,		               /* tp_iternext */
 | 
				
			||||||
 | 
					    icu_BreakIterator_methods,             /* tp_methods */
 | 
				
			||||||
 | 
					    0,             /* tp_members */
 | 
				
			||||||
 | 
					    0,                         /* tp_getset */
 | 
				
			||||||
 | 
					    0,                         /* tp_base */
 | 
				
			||||||
 | 
					    0,                         /* tp_dict */
 | 
				
			||||||
 | 
					    0,                         /* tp_descr_get */
 | 
				
			||||||
 | 
					    0,                         /* tp_descr_set */
 | 
				
			||||||
 | 
					    0,                         /* tp_dictoffset */
 | 
				
			||||||
 | 
					    0,      /* tp_init */
 | 
				
			||||||
 | 
					    0,                         /* tp_alloc */
 | 
				
			||||||
 | 
					    icu_BreakIterator_new,                 /* tp_new */
 | 
				
			||||||
 | 
					}; // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// change_case {{{
 | 
					// change_case {{{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -755,6 +951,28 @@ icu_roundtrip(PyObject *self, PyObject *args) {
 | 
				
			|||||||
    return ret;
 | 
					    return ret;
 | 
				
			||||||
} // }}}
 | 
					} // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// available_locales_for_break_iterator {{{
 | 
				
			||||||
 | 
					// available_locales_for_break_iterator(): return a tuple of byte strings,
					// one per entry reported by ubrk_countAvailable()/ubrk_getAvailable(). An
					// entry for which ICU returns no name becomes the empty string. Returns
					// NULL with an exception set if the tuple or one of its items cannot be
					// created.
					static PyObject *
					icu_break_iterator_locales(PyObject *self, PyObject *args) {
					    int32_t total = 0, idx = 0;
					    const char *name = NULL;
					    PyObject *locales = NULL, *item = NULL;
					
					    total = ubrk_countAvailable();
					    locales = PyTuple_New(total);
					    if (locales == NULL) return NULL;
					
					    for (idx = 0; idx < total; idx++) {
					        name = ubrk_getAvailable(idx);
					        item = PyBytes_FromString(name ? name : "");
					        if (item == NULL) {
					            Py_DECREF(locales);
					            PyErr_NoMemory();
					            return NULL;
					        }
					        // The tuple takes over the reference to item, so it is not DECREF'ed here
					        PyTuple_SET_ITEM(locales, idx, item);
					    }
					
					    return locales;
					} // }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Module initialization {{{
 | 
					// Module initialization {{{
 | 
				
			||||||
static PyMethodDef icu_methods[] = {
 | 
					static PyMethodDef icu_methods[] = {
 | 
				
			||||||
    {"change_case", icu_change_case, METH_VARARGS,
 | 
					    {"change_case", icu_change_case, METH_VARARGS,
 | 
				
			||||||
@ -793,6 +1011,9 @@ static PyMethodDef icu_methods[] = {
 | 
				
			|||||||
     "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)"
 | 
					     "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)"
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    {"available_locales_for_break_iterator", icu_break_iterator_locales, METH_VARARGS, 
 | 
				
			||||||
 | 
					     "available_locales_for_break_iterator() -> Return tuple of all available locales for the BreakIterator"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    {NULL}  /* Sentinel */
 | 
					    {NULL}  /* Sentinel */
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@ -824,12 +1045,15 @@ initicu(void)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    if (PyType_Ready(&icu_CollatorType) < 0)
 | 
					    if (PyType_Ready(&icu_CollatorType) < 0)
 | 
				
			||||||
        return;
 | 
					        return;
 | 
				
			||||||
 | 
					    if (PyType_Ready(&icu_BreakIteratorType) < 0)
 | 
				
			||||||
 | 
					        return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    m = Py_InitModule3("icu", icu_methods,
 | 
					    m = Py_InitModule3("icu", icu_methods,
 | 
				
			||||||
                       "Wrapper for the ICU internationalization library");
 | 
					                       "Wrapper for the ICU internationalization library");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Py_INCREF(&icu_CollatorType);
 | 
					    Py_INCREF(&icu_CollatorType); Py_INCREF(&icu_BreakIteratorType);
 | 
				
			||||||
    PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType);
 | 
					    PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType);
 | 
				
			||||||
 | 
					    PyModule_AddObject(m, "BreakIterator", (PyObject *)&icu_BreakIteratorType);
 | 
				
			||||||
    // uint8_t must be the same size as char
 | 
					    // uint8_t must be the same size as char
 | 
				
			||||||
    PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0);
 | 
					    PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0);
 | 
				
			||||||
    PyModule_AddStringConstant(m, "icu_version", version);
 | 
					    PyModule_AddStringConstant(m, "icu_version", version);
 | 
				
			||||||
@ -864,5 +1088,10 @@ initicu(void)
 | 
				
			|||||||
    ADDUCONST(LOWER_CASE);
 | 
					    ADDUCONST(LOWER_CASE);
 | 
				
			||||||
    ADDUCONST(TITLE_CASE);
 | 
					    ADDUCONST(TITLE_CASE);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ADDUCONST(UBRK_CHARACTER);
 | 
				
			||||||
 | 
					    ADDUCONST(UBRK_WORD);
 | 
				
			||||||
 | 
					    ADDUCONST(UBRK_LINE);
 | 
				
			||||||
 | 
					    ADDUCONST(UBRK_SENTENCE);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
// }}}
 | 
					// }}}
 | 
				
			||||||
 | 
				
			|||||||
@ -20,6 +20,7 @@
 | 
				
			|||||||
#include <unicode/usearch.h>
 | 
					#include <unicode/usearch.h>
 | 
				
			||||||
#include <unicode/utrans.h>
 | 
					#include <unicode/utrans.h>
 | 
				
			||||||
#include <unicode/unorm.h>
 | 
					#include <unicode/unorm.h>
 | 
				
			||||||
 | 
					#include <unicode/ubrk.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if PY_VERSION_HEX >= 0x03030000 
 | 
					#if PY_VERSION_HEX >= 0x03030000 
 | 
				
			||||||
#error Not implemented for python >= 3.3
 | 
					#error Not implemented for python >= 3.3
 | 
				
			||||||
 | 
				
			|||||||
@ -129,18 +129,34 @@ class TestICU(unittest.TestCase):
 | 
				
			|||||||
                {' ':[''], 'A':['A1', 'a1'], '\U0001f431':['\U0001f431', '\U0001f431x']})
 | 
					                {' ':[''], 'A':['A1', 'a1'], '\U0001f431':['\U0001f431', '\U0001f431x']})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_roundtrip(self):
        ' Test roundtripping '
        # Samples: text with an embedded NUL plus BMP and non-BMP characters,
        # a lone NUL, the empty string and plain ASCII
        samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
        for sample in samples:
            self.ae(sample, icu._icu.roundtrip(sample))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_character_name(self):
        ' Test character naming '
        # U+1F431 lies outside the BMP
        name = icu.character_name('\U0001f431')
        self.ae(name, 'CAT FACE')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_contractions(self):
        ' Test contractions '
        czech = icu._icu.Collator('cs')
        # Contractions expected for the 'cs' collator
        expected = frozenset({u'Z\u030c', u'z\u030c', u'Ch',
            u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
            u'S\u030c', u'R\u030c'})
        self.ae(icu.contractions(czech), expected)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_break_iterator(self):
					        ' Test the break iterator '
					        from calibre.spell.break_iterator import split_into_words as split, index_of
					        # Whitespace and punctuation between words must not produce tokens
					        expected_words = ['one', 'two', 'three']
					        for q in ('one two three', ' one two three', 'one\ntwo  three ', 'one-two,three'):
					            self.ae(split(unicode(q)), expected_words, 'Failed to split: %r' % q)
					        # An apostrophe inside a word does not split it
					        self.ae(split(u'I I\'m'), ['I', "I'm"])
					        # (expected index, needle, haystack): only whole words match, and
					        # an empty needle or haystack never matches
					        cases = (
					            (0, 'i', 'i'),
					            (4, 'i', 'six i'),
					            (-1, 'i', ''),
					            (-1, '', ''),
					            (-1, '', 'i'),
					            (-1, 'i', 'six clicks'),
					        )
					        for expected, needle, haystack in cases:
					            self.ae(expected, index_of(needle, haystack))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestRunner(unittest.main):
 | 
					class TestRunner(unittest.main):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def createTests(self):
 | 
					    def createTests(self):
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user