diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index b5f0ec0169..b5805bd4e1 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -255,6 +255,60 @@ icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs)
     return Py_BuildValue("O", ans);
 } // }}}
 
+// Collator.span_contractions {{{
+/*
+ * Collator.span_contractions(src, span_condition) -> int
+ *
+ * Returns the length, in UTF-16 code units, of the initial part of src that
+ * satisfies span_condition (one of the USET_SPAN_* constants) with respect to
+ * the set of contractions of this collator. The set is built on first use and
+ * cached on the collator object.
+ *
+ * Returns a new Python int, or NULL with an exception set if the arguments
+ * are invalid, memory runs out or src cannot be converted for ICU.
+ */
+static PyObject *
+icu_Collator_span_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
+    int span_type;
+    int32_t ulen = 0, span = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    PyObject *str, *ans = NULL;
+    Py_ssize_t slen = 0;
+    size_t capacity = 0;
+    wchar_t *buf = NULL;
+    UChar *s = NULL;
+
+    if (!PyArg_ParseTuple(args, "Ui", &str, &span_type)) return NULL;
+
+    if (self->contractions == NULL) {
+        self->contractions = uset_open(1, 0);
+        if (self->contractions == NULL) return PyErr_NoMemory();
+        // The status of this call is not checked; it is reset just below.
+        ucol_getContractionsAndExpansions(self->collator, self->contractions, NULL, 0, &status);
+    }
+    status = U_ZERO_ERROR;
+
+    slen = PyUnicode_GetSize(str);
+    if (slen < 0) return NULL;
+    capacity = (size_t)slen*4 + 2;
+    buf = (wchar_t*)calloc(capacity, sizeof(wchar_t));
+    s = (UChar*)calloc(capacity, sizeof(UChar));
+    // free(NULL) is a no-op, so a partial allocation is released at end.
+    if (buf == NULL || s == NULL) { PyErr_NoMemory(); goto end; }
+
+    slen = PyUnicode_AsWideChar((PyUnicodeObject*)str, buf, slen);
+    if (slen < 0) goto end;
+    u_strFromWCS(s, (int32_t)(capacity - 1), &ulen, buf, (int32_t)slen, &status);
+    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
+
+    // s has to outlive uset_span(), so the buffers are only freed afterwards.
+    span = uset_span(self->contractions, s, ulen, (USetSpanCondition)span_type);
+    ans = Py_BuildValue("i", span);
+end:
+    free(buf); free(s);
+    return ans;
+} // }}}
+
 static PyMethodDef icu_Collator_methods[] = {
     {"sort_key", (PyCFunction)icu_Collator_sort_key, METH_VARARGS,
      "sort_key(unicode object) -> Return a sort key for the given object as a bytestring. The idea is that these bytestring will sort using the builtin cmp function, just like the original unicode strings would sort in the current locale with ICU."
@@ -271,6 +325,11 @@ static PyMethodDef icu_Collator_methods[] = {
     {"contractions", (PyCFunction)icu_Collator_contractions, METH_VARARGS,
      "contractions() -> returns the contractions defined for this collator."
     },
+
+    {"span_contractions", (PyCFunction)icu_Collator_span_contractions, METH_VARARGS,
+     "span_contractions(src, span_condition) -> returns the length of the initial substring according to span_condition in the set of contractions for this collator. Returns 0 if src does not fit the span_condition. The span_condition can be one of USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED, USET_SPAN_SIMPLE."
+    },
+
     {NULL}  /* Sentinel */
 };
 
@@ -527,6 +586,9 @@ initicu(void)
     PyModule_AddObject(m, "Collator", (PyObject *)&icu_CollatorType);
     // uint8_t must be the same size as char
     PyModule_AddIntConstant(m, "ok", (U_SUCCESS(status) && sizeof(uint8_t) == sizeof(char)) ? 1 : 0);
+    PyModule_AddIntConstant(m, "USET_SPAN_NOT_CONTAINED", USET_SPAN_NOT_CONTAINED);
+    PyModule_AddIntConstant(m, "USET_SPAN_CONTAINED", USET_SPAN_CONTAINED);
+    PyModule_AddIntConstant(m, "USET_SPAN_SIMPLE", USET_SPAN_SIMPLE);
 
 }
 // }}}
diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 95751d02b7..ead820f066 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -104,6 +104,26 @@ def icu_contractions(collator):
         _cmap[collator] = ans
     return ans
 
+def py_span_contractions(*args, **kwargs):
+    ''' Fallback selected when _icu_not_ok is set: always reports a span of 0. '''
+    return 0
+
+def icu_span_contractions(src, span_type=None, collator=None):
+    '''
+    Return collator.span_contractions(src, span_type). collator defaults to
+    the module level _collator and span_type to _icu.USET_SPAN_SIMPLE. If the
+    call raises TypeError it is retried once with unicode(src).
+    '''
+    global _collator
+    if collator is None:
+        collator = _collator
+    if span_type is None:
+        span_type = _icu.USET_SPAN_SIMPLE
+    try:
+        return collator.span_contractions(src, span_type)
+    except TypeError:
+        return collator.span_contractions(unicode(src), span_type)
+
 load_icu()
 load_collator()
 _icu_not_ok = _icu is None or _collator is None
@@ -144,6 +164,9 @@ find = (py_find if _icu_not_ok else partial(icu_find, _collator))
 contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
     _collator)))
 
+span_contractions = (py_span_contractions if _icu_not_ok else
+        icu_span_contractions)
+
 ################################################################################
 
 def test(): # {{{