mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Faster roundtripping of strings between ICU and python
This commit is contained in:
parent
e05c4e669b
commit
df6a06c8b7
@ -95,6 +95,7 @@ extensions = [
|
|||||||
|
|
||||||
Extension('icu',
|
Extension('icu',
|
||||||
['calibre/utils/icu.c'],
|
['calibre/utils/icu.c'],
|
||||||
|
headers=['calibre/utils/icu_calibre_utils.h'],
|
||||||
libraries=icu_libs,
|
libraries=icu_libs,
|
||||||
lib_dirs=icu_lib_dirs,
|
lib_dirs=icu_lib_dirs,
|
||||||
inc_dirs=icu_inc_dirs,
|
inc_dirs=icu_inc_dirs,
|
||||||
|
@ -113,9 +113,10 @@ def test_ssl():
|
|||||||
print ('SSL OK!')
|
print ('SSL OK!')
|
||||||
|
|
||||||
def test_icu():
|
def test_icu():
|
||||||
from calibre.utils.icu import _icu_not_ok
|
from calibre.utils.icu import _icu_not_ok, test_roundtrip
|
||||||
if _icu_not_ok:
|
if _icu_not_ok:
|
||||||
raise RuntimeError('ICU module not loaded/valid')
|
raise RuntimeError('ICU module not loaded/valid')
|
||||||
|
test_roundtrip()
|
||||||
print ('ICU OK!')
|
print ('ICU OK!')
|
||||||
|
|
||||||
def test_wpd():
|
def test_wpd():
|
||||||
|
@ -1,16 +1,4 @@
|
|||||||
#define UNICODE
|
#include "icu_calibre_utils.h"
|
||||||
#define PY_SSIZE_T_CLEAN
|
|
||||||
#include <Python.h>
|
|
||||||
#include <unicode/uversion.h>
|
|
||||||
#include <unicode/utypes.h>
|
|
||||||
#include <unicode/uclean.h>
|
|
||||||
#include <unicode/utf16.h>
|
|
||||||
#include <unicode/ucol.h>
|
|
||||||
#include <unicode/ucoleitr.h>
|
|
||||||
#include <unicode/ustring.h>
|
|
||||||
#include <unicode/usearch.h>
|
|
||||||
#include <unicode/utrans.h>
|
|
||||||
#include <unicode/unorm.h>
|
|
||||||
|
|
||||||
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
|
static PyObject* uchar_to_unicode(const UChar *src, int32_t len) {
|
||||||
wchar_t *buf = NULL;
|
wchar_t *buf = NULL;
|
||||||
@ -513,7 +501,6 @@ icu_Collator_clone(icu_Collator *self, PyObject *args, PyObject *kwargs)
|
|||||||
|
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// Module initialization {{{
|
|
||||||
|
|
||||||
// upper {{{
|
// upper {{{
|
||||||
static PyObject *
|
static PyObject *
|
||||||
@ -790,17 +777,12 @@ static PyObject *
|
|||||||
icu_normalize(PyObject *self, PyObject *args) {
|
icu_normalize(PyObject *self, PyObject *args) {
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0;
|
int32_t sz = 0, mode = UNORM_DEFAULT, cap = 0, rsz = 0;
|
||||||
char *buf = NULL, *utf8 = NULL;
|
|
||||||
UChar *dest = NULL, *source = NULL;
|
UChar *dest = NULL, *source = NULL;
|
||||||
PyObject *ret = NULL;
|
PyObject *ret = NULL, *src = NULL;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "ies#", &mode, "UTF-8", &buf, &sz)) return NULL;
|
if (!PyArg_ParseTuple(args, "iO", &mode, &src)) return NULL;
|
||||||
|
source = python_to_icu(src, &sz, 1);
|
||||||
cap = 2 * sz;
|
if (source == NULL) goto end;
|
||||||
source = (UChar*) calloc(cap, sizeof(UChar));
|
|
||||||
if (source == NULL) { PyErr_NoMemory(); goto end; }
|
|
||||||
u_strFromUTF8(source, cap, &sz, buf, sz, &status);
|
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; } cap = 2 * sz;
|
|
||||||
cap = 2 * sz;
|
cap = 2 * sz;
|
||||||
dest = (UChar*) calloc(cap, sizeof(UChar));
|
dest = (UChar*) calloc(cap, sizeof(UChar));
|
||||||
if (dest == NULL) { PyErr_NoMemory(); goto end; }
|
if (dest == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
@ -820,23 +802,32 @@ icu_normalize(PyObject *self, PyObject *args) {
|
|||||||
PyErr_SetString(PyExc_ValueError, u_errorName(status));
|
PyErr_SetString(PyExc_ValueError, u_errorName(status));
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
|
|
||||||
utf8 = (char*)calloc(rsz*5+1, sizeof(char));
|
|
||||||
if (utf8 == NULL) {PyErr_NoMemory(); goto end;}
|
|
||||||
u_strToUTF8(utf8, rsz*5, &sz, dest, rsz, &status);
|
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
|
|
||||||
|
|
||||||
ret = PyUnicode_DecodeUTF8(utf8, sz, "replace");
|
ret = icu_to_python(dest, rsz);
|
||||||
if (ret == NULL) PyErr_NoMemory();
|
|
||||||
|
|
||||||
end:
|
end:
|
||||||
if (buf != NULL) PyMem_Free(buf);
|
|
||||||
if (source != NULL) free(source);
|
if (source != NULL) free(source);
|
||||||
if (dest != NULL) free(dest);
|
if (dest != NULL) free(dest);
|
||||||
if (utf8 != NULL) free(utf8);
|
|
||||||
return ret;
|
return ret;
|
||||||
} // }}}
|
} // }}}
|
||||||
|
|
||||||
|
// roundtrip {{{
|
||||||
|
/*
 * roundtrip(string): convert a Python unicode object to an ICU UChar buffer
 * and straight back again. Returns the new unicode object, or NULL with a
 * Python exception set if argument parsing or either conversion fails.
 */
static PyObject *
icu_roundtrip(PyObject *self, PyObject *args) {
    PyObject *input = NULL, *result = NULL;
    UChar *buffer = NULL;
    int32_t length = 0;

    if (!PyArg_ParseTuple(args, "O", &input)) return NULL;

    // python_to_icu() type-checks the argument (do_check == 1) and sets the exception on failure
    buffer = python_to_icu(input, &length, 1);
    if (buffer == NULL) return NULL;

    result = icu_to_python(buffer, length);
    free(buffer);  // the intermediate buffer is owned by this function
    return result;
} // }}}
|
||||||
|
|
||||||
|
// Module initialization {{{
|
||||||
static PyMethodDef icu_methods[] = {
|
static PyMethodDef icu_methods[] = {
|
||||||
{"upper", icu_upper, METH_VARARGS,
|
{"upper", icu_upper, METH_VARARGS,
|
||||||
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
|
"upper(locale, unicode object) -> upper cased unicode object using locale rules."
|
||||||
@ -878,6 +869,11 @@ static PyMethodDef icu_methods[] = {
|
|||||||
"normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
|
"normalize(mode, unicode_text) -> Return a python unicode string which is normalized in the specified mode."
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{"roundtrip", icu_roundtrip, METH_VARARGS,
|
||||||
|
"roundtrip(string) -> Roundtrip a unicode object from python to ICU back to python (useful for testing)"
|
||||||
|
},
|
||||||
|
|
||||||
|
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -891,6 +887,11 @@ initicu(void)
|
|||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
char version[U_MAX_VERSION_STRING_LENGTH+1] = {0}, uversion[U_MAX_VERSION_STRING_LENGTH+5] = {0};
|
char version[U_MAX_VERSION_STRING_LENGTH+1] = {0}, uversion[U_MAX_VERSION_STRING_LENGTH+5] = {0};
|
||||||
|
|
||||||
|
if (sizeof(Py_UNICODE) != 2 && sizeof(Py_UNICODE) != 4) {
|
||||||
|
PyErr_SetString(PyExc_RuntimeError, "This module only works on python versions <= 3.2");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
u_init(&status);
|
u_init(&status);
|
||||||
if (U_FAILURE(status)) {
|
if (U_FAILURE(status)) {
|
||||||
PyErr_SetString(PyExc_RuntimeError, u_errorName(status));
|
PyErr_SetString(PyExc_RuntimeError, u_errorName(status));
|
||||||
|
@ -169,6 +169,10 @@ def safe_chr(code):
|
|||||||
return py_safe_chr(code)
|
return py_safe_chr(code)
|
||||||
|
|
||||||
def normalize(text, mode='NFC'):
|
def normalize(text, mode='NFC'):
|
||||||
|
# This is very slightly slower than using unicodedata.normalize, so stick with
|
||||||
|
# that unless you have very good reasons not to. Also, its speed
|
||||||
|
# decreases on wide python builds, where conversion to/from ICU's string
|
||||||
|
# representation is slower.
|
||||||
try:
|
try:
|
||||||
return _icu.normalize(_nmodes[mode], unicode(text))
|
return _icu.normalize(_nmodes[mode], unicode(text))
|
||||||
except (AttributeError, KeyError):
|
except (AttributeError, KeyError):
|
||||||
@ -503,6 +507,28 @@ pêché'''
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
def test_roundtrip():
    '''
    Check that a unicode string survives conversion to ICU's representation
    and back unchanged. The sample contains an embedded NUL, a BMP character
    and a non-BMP character. Raises ValueError on mismatch.
    '''
    original = u'xxx\0\u2219\U0001f431xxx'
    result = _icu.roundtrip(original)
    if result == original:
        return
    raise ValueError(u'Roundtripping failed: %r != %r' % (original, result))
|
||||||
|
|
||||||
|
def test_normalize_performance(path='t.txt', count=100):
    '''
    Crude benchmark comparing normalize() from this module against
    unicodedata.normalize() for NFC normalization.

    :param path: UTF-8 encoded text file used as the benchmark input.
    :param count: Number of times each implementation is run over the input.

    Prints the length of the decoded text and the elapsed time for each
    implementation.
    '''
    # Use a context manager so the input file is closed before timing starts
    with open(path, 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time, unicodedata
    st = time.time()
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
test_roundtrip()
|
||||||
|
test_normalize_performance()
|
||||||
test()
|
test()
|
||||||
|
|
||||||
|
61
src/calibre/utils/icu_calibre_utils.h
Normal file
61
src/calibre/utils/icu_calibre_utils.h
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
 * icu_calibre_utils.h
|
||||||
|
* Copyright (C) 2014 Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
*
|
||||||
|
* Distributed under terms of the GPL3 license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#define UNICODE
|
||||||
|
#define PY_SSIZE_T_CLEAN
|
||||||
|
#include <Python.h>
|
||||||
|
#include <unicode/uversion.h>
|
||||||
|
#include <unicode/utypes.h>
|
||||||
|
#include <unicode/uclean.h>
|
||||||
|
#include <unicode/utf16.h>
|
||||||
|
#include <unicode/ucol.h>
|
||||||
|
#include <unicode/ucoleitr.h>
|
||||||
|
#include <unicode/ustring.h>
|
||||||
|
#include <unicode/usearch.h>
|
||||||
|
#include <unicode/utrans.h>
|
||||||
|
#include <unicode/unorm.h>
|
||||||
|
|
||||||
|
#if PY_VERSION_HEX < 0x03030000
|
||||||
|
// Roundtripping will need to be implemented differently for python > 3.2 where strings are stored with variable widths
|
||||||
|
|
||||||
|
/*
 * Convert a Python unicode object into a freshly allocated, NUL terminated
 * UTF-16 buffer suitable for passing to ICU.
 *
 * obj      - the Python object to convert.
 * osz      - if not NULL, receives the length of the result in UChars
 *            (not counting the terminator).
 * do_check - if non-zero, verify that obj is exactly a unicode object and
 *            raise TypeError otherwise.
 *
 * Returns the buffer, which the caller must release with free(), or NULL
 * with a Python exception set (TypeError, ValueError or MemoryError).
 */
static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
    UChar *ans = NULL;
    Py_ssize_t sz = 0;
    UErrorCode status = U_ZERO_ERROR;

    if (do_check && !PyUnicode_CheckExact(obj)) {
        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
        return NULL;
    }

    sz = PyUnicode_GET_SIZE(obj);
    // ICU lengths are int32_t and the wide path needs up to 2*sz+1 UChars
    if (sz > (INT32_MAX - 1) / 2) {
        PyErr_SetString(PyExc_ValueError, "String too long to convert to ICU");
        return NULL;
    }

    if (sizeof(Py_UNICODE) == 2) { // narrow build (UTF-16)
        // Allocating sz+1 units keeps the request non-zero for empty strings
        // (calloc(0, ...) may return NULL) and NUL terminates the result.
        ans = (UChar*) calloc((size_t)sz + 1, sizeof(UChar));
        if (ans == NULL) { PyErr_NoMemory(); return NULL; }
        memcpy(ans, PyUnicode_AS_UNICODE(obj), (size_t)sz * sizeof(UChar));
        if (osz != NULL) *osz = (int32_t)sz;
    } else { // wide build (UCS 4)
        int32_t cap = 2 * (int32_t)sz + 1; // There can be no more than 2 UChars per character
        ans = (UChar*) calloc((size_t)cap, sizeof(UChar));
        if (ans == NULL) { PyErr_NoMemory(); return NULL; }
        u_strFromUTF32(ans, cap, osz, (const UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
        if (U_FAILURE(status)) {
            PyErr_SetString(PyExc_ValueError, u_errorName(status));
            // Callers detect failure via a NULL return, so never hand back
            // a buffer together with a pending exception.
            free(ans);
            ans = NULL;
        }
    }
    return ans;
}
|
||||||
|
|
||||||
|
/*
 * Convert an ICU UTF-16 buffer into a new Python unicode object.
 *
 * src - the UTF-16 data, in native byte order.
 * sz  - the length of src in UChars.
 *
 * Returns a new reference, or NULL with a Python exception set.
 */
static PyObject* icu_to_python(UChar *src, int32_t sz) {
    // An explicit byte order stops the decoder from sniffing for a BOM.
    // With a NULL/zero byteorder a leading U+FEFF is dropped from the
    // result and a leading 0xFFFE unit flips the byte order of the whole
    // buffer, so such strings would not roundtrip.
    int byteorder = U_IS_BIG_ENDIAN ? 1 : -1;

    if (sizeof(Py_UNICODE) == 2) // narrow build UTF-16
        return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
    return PyUnicode_DecodeUTF16((const char*)src, (Py_ssize_t)sz * sizeof(UChar), "strict", &byteorder);
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user