diff --git a/setup/extensions.py b/setup/extensions.py index 914ff102e8..321ce2da1a 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -95,6 +95,7 @@ extensions = [ Extension('icu', ['calibre/utils/icu.c'], + headers=['calibre/utils/icu_calibre_utils.h'], libraries=icu_libs, lib_dirs=icu_lib_dirs, inc_dirs=icu_inc_dirs, diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py index 5baa9d059f..618626883e 100644 --- a/src/calibre/test_build.py +++ b/src/calibre/test_build.py @@ -113,9 +113,10 @@ def test_ssl(): print ('SSL OK!') def test_icu(): - from calibre.utils.icu import _icu_not_ok + from calibre.utils.icu import _icu_not_ok, test_roundtrip if _icu_not_ok: raise RuntimeError('ICU module not loaded/valid') + test_roundtrip() # raises ValueError if a string is changed by a python -> ICU -> python roundtrip print ('ICU OK!') def test_wpd(): diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 44edecf7d4..281f34cf57 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -1,16 +1,4 @@ -#define UNICODE -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "icu_calibre_utils.h" static PyObject* uchar_to_unicode(const UChar *src, int32_t len) { wchar_t *buf = NULL; @@ -513,7 +501,6 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs) // }}} -// Module initialization {{{ // upper {{{ static PyObject * @@ -790,17 +777,12 @@ static PyObject * icu_normalize(PyObject *self, PyObject *args) { UErrorCode status = U_ZERO_ERROR; int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0; - char *buf = NULL, *utf8 = NULL; UChar *dest = NULL, *source = NULL; - PyObject *ret = NULL; + PyObject *ret = NULL, *src = NULL; - if (!PyArg_ParseTuple(args, "ies#", &mode, "UTF-8", &buf, &sz)) return NULL; - - cap = 2 * sz; - source = (UChar*) calloc(cap, sizeof(UChar)); - if (source == NULL) { PyErr_NoMemory(); goto end; } - u_strFromUTF8(source, cap, &sz, buf, sz, &status); - if (U_FAILURE(status)) { 
PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } cap = 2 * sz; + if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL; + source = python_to_icu(src, &sz, 1); + if (source == NULL) goto end; cap = 2 * sz; dest = (UChar*) calloc(cap, sizeof(UChar)); if (dest == NULL) { PyErr_NoMemory(); goto end; } @@ -820,23 +802,32 @@ icu_normalize(PyObject *self, PyObject *args) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } - - utf8 = (char*)calloc(rsz*5+1, sizeof(char)); - if (utf8 == NULL) {PyErr_NoMemory(); goto end;} - u_strToUTF8(utf8, rsz*5, &sz, dest, rsz, &status); - if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } - ret = PyUnicode_DecodeUTF8(utf8, sz, "replace"); - if (ret == NULL) PyErr_NoMemory(); + ret = icu_to_python(dest, rsz); end: - if (buf != NULL) PyMem_Free(buf); if (source != NULL) free(source); if (dest != NULL) free(dest); - if (utf8 != NULL) free(utf8); return ret; } // }}} +// roundtrip {{{ +static PyObject * +icu_roundtrip(PyObject *self, PyObject *args) { // args holds a single object ("O"); it is converted to ICU and back and the result is returned + int32_t sz = 0; + UChar *icu = NULL; + PyObject *ret = NULL, *src = NULL; + + if (!PyArg_ParseTuple(args, "O", &src)) return NULL; + icu = python_to_icu(src, &sz, 1); // NULL on failure, in which case ret stays NULL + if (icu != NULL) { + ret = icu_to_python(icu, sz); + free(icu); // the intermediate ICU buffer is not needed once converted back + } + return ret; +} // }}} + +// Module initialization {{{ static PyMethodDef icu_methods[] = { {"upper", icu_upper, METH_VARARGS, "upper(locale, unicode object) -> upper cased unicode object using locale rules." @@ -878,6 +869,11 @@ static PyMethodDef icu_methods[] = { "normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode." 
}, + {"roundtrip", icu_roundtrip, METH_VARARGS, + "roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)" + }, + + {NULL} /* Sentinel */ }; @@ -891,6 +887,11 @@ initicu(void) UErrorCode status = U_ZERO_ERROR; char version[U_MAX_VERSION_STRING_LENGTH+1] = {0}, uversion[U_MAX_VERSION_STRING_LENGTH+5] = {0}; + if (sizeof(Py_UNICODE) != 2 && sizeof(Py_UNICODE) != 4) { + PyErr_SetString(PyExc_RuntimeError, "This module only works on python versions <= 3.2"); + return; + } + u_init(&status); if (U_FAILURE(status)) { PyErr_SetString(PyExc_RuntimeError, u_errorName(status)); diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 1f46fc87d0..b1e01768ba 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -169,6 +169,10 @@ def safe_chr(code): return py_safe_chr(code) def normalize(text, mode='NFC'): + # This is very slightly slower than using unicodedata.normalize, so stick with + # that unless you have very good reasons not to. Also, its speed + # decreases on wide python builds, where conversion to/from ICU's string + # representation is slower. 
try: return _icu.normalize(_nmodes[mode], unicode(text)) except (AttributeError, KeyError): @@ -503,6 +507,28 @@ pĂȘchĂ©''' # }}} +def test_roundtrip(): + r = u'xxx\0\u2219\U0001f431xxx' + rp = _icu.roundtrip(r) + if rp != r: + raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp)) + +def test_normalize_performance(): + raw = open('t.txt', 'rb').read().decode('utf-8') + print (len(raw)) + import time, unicodedata + st = time.time() + count = 100 + for i in xrange(count): + normalize(raw) + print ('ICU time:', time.time() - st) + st = time.time() + for i in xrange(count): + unicodedata.normalize('NFC', unicode(raw)) + print ('py time:', time.time() - st) + if __name__ == '__main__': + test_roundtrip() + test_normalize_performance() test() diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h new file mode 100644 index 0000000000..cfc3f0015b --- /dev/null +++ b/src/calibre/utils/icu_calibre_utils.h @@ -0,0 +1,61 @@ +/* + * icu.h + * Copyright (C) 2014 Kovid Goyal + * + * Distributed under terms of the GPL3 license. 
+ */
+
+#pragma once
+
+#define UNICODE
+#define PY_SSIZE_T_CLEAN
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if PY_VERSION_HEX < 0x03030000
+// Roundtripping will need to be implemented differently for python > 3.2 where strings are stored with variable widths
+
+/*
+ * Convert a python unicode object into a newly allocated, NUL terminated
+ * ICU string (UTF-16).
+ *
+ * obj:      the python object to convert
+ * osz:      if not NULL, receives the length of the result in UChars, not
+ *           counting the terminating NUL
+ * do_check: if non-zero, set TypeError and fail unless obj is exactly a
+ *           unicode object
+ *
+ * Returns the buffer, which the caller must release with free(), or NULL with
+ * a python exception set on failure.
+ */
+static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
+    UChar *ans = NULL;
+    Py_ssize_t sz = 0;
+    UErrorCode status = U_ZERO_ERROR;
+
+    if (do_check && !PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        goto end;
+    }
+
+    sz = PyUnicode_GET_SIZE(obj);
+    // ICU lengths are int32_t and the result can need 2*sz+1 UChars, so refuse
+    // anything that would overflow instead of silently truncating the length
+    if (sz > (INT32_MAX - 1) / 2) {
+        PyErr_SetString(PyExc_OverflowError, "Unicode string is too long to convert to ICU");
+        goto end;
+    }
+
+    if (sizeof(Py_UNICODE) == 2) { // narrow build (UTF-16)
+        // One extra UChar NUL terminates the copy and keeps the request
+        // non-zero for empty strings, where calloc() may legally return NULL
+        ans = (UChar*) calloc(sz + 1, sizeof(UChar));
+        if (ans == NULL) { PyErr_NoMemory(); goto end; }
+        memcpy(ans, PyUnicode_AS_UNICODE(obj), PyUnicode_GET_DATA_SIZE(obj));
+        if (osz != NULL) *osz = (int32_t)sz;
+    } else { // wide build (UCS 4)
+        ans = (UChar*) calloc(2*sz+1, sizeof(UChar)); // There can be no more than 2 UChars per character
+        if (ans == NULL) { PyErr_NoMemory(); goto end; }
+        u_strFromUTF32(ans, (int32_t)(2*sz+1), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
+        if (U_FAILURE(status)) {
+            PyErr_SetString(PyExc_ValueError, u_errorName(status));
+            // Callers treat a non-NULL result as success, so never return the
+            // buffer together with an exception
+            free(ans); ans = NULL;
+            goto end;
+        }
+    }
+end:
+    return ans;
+}
+
+/*
+ * Convert an ICU string (UTF-16) of sz UChars into a new python unicode
+ * object. Returns NULL with a python exception set on failure.
+ */
+static PyObject* icu_to_python(UChar *src, int32_t sz) {
+    if (sizeof(Py_UNICODE) == 2) { // narrow build UTF-16
+        return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
+    } else {
+        // Pin the decoder to the native byte order. With a NULL byteorder a
+        // leading U+FEFF/U+FFFE is treated as a byte order mark: it is dropped
+        // from the result and U+FFFE even flips the order used for the rest
+        int byteorder = U_IS_BIG_ENDIAN ? 1 : -1;
+        return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", &byteorder);
+    }
+}
+
+#endif
+