From 5a7b2510254c72fdf233309082a911f98c3f52c1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 19 Dec 2015 18:44:34 +0530
Subject: [PATCH] Fix implementation of clean_xml_chars

Also speed it up by implementing it in native code
---
 src/calibre/test_build.py      |  2 ++
 src/calibre/utils/cleantext.py | 28 ++++++++++++++++++++-
 src/calibre/utils/speedup.c    | 50 ++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 55485c5ba2..29c1b848bd 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -79,6 +79,8 @@ def test_plugins():
     print ('Loaded all plugins successfully!')
 
 def test_lxml():
+    from calibre.utils.cleantext import test_clean_xml_chars
+    test_clean_xml_chars()
     from lxml import etree
     raw = ''
     root = etree.fromstring(raw)
diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index c38aa18eda..5a68ff736b 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -4,6 +4,20 @@ __docformat__ = 'restructuredtext en'
 
 import re, htmlentitydefs
 from future_builtins import map
+from calibre.constants import plugins, preferred_encoding
+
+try:
+    _ncxc = plugins['speedup'][0].clean_xml_chars
+except AttributeError:
+    # No native implementation available, py_clean_xml_chars is used instead
+    native_clean_xml_chars = None
+else:
+    def native_clean_xml_chars(x):
+        ''' Remove characters not allowed in XML from x using native code.
+        Bytestrings are first decoded using preferred_encoding. '''
+        if isinstance(x, bytes):
+            x = x.decode(preferred_encoding)
+        return _ncxc(x)
 
 _ascii_pat = None
 
@@ -32,9 +46,21 @@ def allowed(x):
     x = ord(x)
     return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
 
-def clean_xml_chars(unicode_string):
+def py_clean_xml_chars(unicode_string):
+    ' Pure python fallback: keep only the characters accepted by allowed() '
     return u''.join(filter(allowed, unicode_string))
 
+# Prefer the native implementation when it is available
+clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars
+
+def test_clean_xml_chars():
+    ' Check that the native code strips control chars and unpaired surrogates '
+    if native_clean_xml_chars is None:
+        raise RuntimeError('The speedup plugin does not provide clean_xml_chars')
+    raw = u'asd\x02a\U00010437x\ud801b\udffe\ud802'
+    if native_clean_xml_chars(raw) != u'asda\U00010437xb':
+        raise ValueError('Failed to XML clean: %r' % raw)
+
 
 # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
 # Removes HTML or XML character references and entities from a text string.
diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c
index 460c33a657..d989cb9f0d 100644
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -316,6 +316,52 @@ error:
     return Py_BuildValue("NII", ans, state, codep);
 }
 
+/*
+ * Remove all codepoints that are not allowed in XML (and DEL, 0x7f) from the
+ * unicode object text, returning the result as a new unicode object. On narrow
+ * builds surrogates are kept only when they form a valid high/low pair. Sets
+ * TypeError and returns NULL if text is not a unicode object.
+ */
+static PyObject*
+clean_xml_chars(PyObject *self, PyObject *text) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    Py_UNICODE *buf = NULL, ch;
+    PyUnicodeObject *ans = NULL;
+    Py_ssize_t i = 0, j = 0;
+    if (!PyUnicode_Check(text)) {
+        PyErr_SetString(PyExc_TypeError, "A unicode string is required");
+        return NULL;
+    }
+    ans = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, PyUnicode_GET_SIZE(text));
+    if (ans == NULL) return PyErr_NoMemory();
+    buf = ans->str;
+
+    for (; i < PyUnicode_GET_SIZE(text); i++) {
+        ch = PyUnicode_AS_UNICODE(text)[i];
+#ifdef Py_UNICODE_WIDE
+        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xe000 <= ch && ch <= 0xfffd) || (0xffff < ch && ch <= 0x10ffff))
+            buf[j++] = ch;
+#else
+        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xd800 <= ch && ch <= 0xfffd)) {
+            if (0xd800 <= ch && ch <= 0xdfff) {
+                // Test for valid surrogate pair
+                if (ch <= 0xdbff && i + 1 < PyUnicode_GET_SIZE(text) && 0xdc00 <= PyUnicode_AS_UNICODE(text)[i + 1] && PyUnicode_AS_UNICODE(text)[i+1] <= 0xdfff) {
+                    buf[j++] = ch; buf[j++] = PyUnicode_AS_UNICODE(text)[++i];
+                }
+            } else
+                buf[j++] = ch;
+        }
+#endif
+    }
+    // The output can be shorter than the input, so re-terminate the buffer at
+    // its new length. PyUnicode_FromUnicode(NULL, n) allocates n + 1 items.
+    buf[j] = 0;
+    ans->length = j;
+    return (PyObject*)ans;
+}
+
 static PyMethodDef speedup_methods[] = {
     {"parse_date", speedup_parse_date, METH_VARARGS,
             "parse_date()\n\nParse ISO dates faster."
@@ -351,6 +397,10 @@ static PyMethodDef speedup_methods[] = {
         "utf8_decode(data, [, state=0, codep=0)\n\nDecode an UTF-8 bytestring, using a strict UTF-8 decoder, that unlike python does not allow orphaned surrogates. Returns a unicode object and the state."
     },
 
+    {"clean_xml_chars", clean_xml_chars, METH_O,
+        "clean_xml_chars(unicode_object)\n\nRemove codepoints in unicode_object that are not allowed in XML"
+    },
+
     {NULL, NULL, 0, NULL}
 };
 