diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 7d1e9871c9..77ebb53c23 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -975,6 +975,25 @@ icu_break_iterator_locales(PyObject *self, PyObject *args) { return ret; } // }}} +// string_length {{{ +static PyObject * +icu_string_length(PyObject *self, PyObject *args) { +#if PY_VERSION_HEX >= 0x03030000 +#error Not implemented for python >= 3.3 +#endif + + int32_t sz = 0; + UChar *icu = NULL; + PyObject *src = NULL; + + if (!PyArg_ParseTuple(args, "O", &src)) return NULL; + icu = python_to_icu(src, &sz, 1); + if (icu == NULL) return NULL; + sz = u_countChar32(icu, sz); + free(icu); + return Py_BuildValue("i", sz); +} // }}} + // Module initialization {{{ static PyMethodDef icu_methods[] = { {"change_case", icu_change_case, METH_VARARGS, @@ -1017,6 +1036,10 @@ static PyMethodDef icu_methods[] = { "available_locales_for_break_iterator() -> Return tuple of all available locales for the BreakIterator" }, + {"string_length", icu_string_length, METH_VARARGS, + "string_length(string) -> Return the length of a string (number of unicode code points in the string). Useful on narrow python builds where len() returns an incorrect answer if the string contains surrogate pairs." 
+ }, + {NULL} /* Sentinel */ }; diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index e062f9ef00..eafd0768e0 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -278,6 +278,12 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x): ans[last_c] = [item] return ans +# Return the number of unicode codepoints in a string +try: + string_length = _icu.string_length if is_narrow_build else len +except AttributeError: + string_length = len # Somebody running from source with a binary that has not been updated + ################################################################################ if __name__ == '__main__': diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 4bbb264d5a..2b6572c35f 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -132,6 +132,8 @@ class TestICU(unittest.TestCase): ' Test roundtripping ' for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): self.ae(r, icu._icu.roundtrip(r)) + for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]: + self.ae(icu._icu.string_length(x), l) def test_character_name(self): ' Test character naming '