mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix implementation of clean_xml_chars
Also speed it up by implementing it in native code
This commit is contained in:
parent
61064892b0
commit
5a7b251025
@ -79,6 +79,8 @@ def test_plugins():
|
|||||||
print ('Loaded all plugins successfully!')
|
print ('Loaded all plugins successfully!')
|
||||||
|
|
||||||
def test_lxml():
|
def test_lxml():
|
||||||
|
from calibre.utils.cleantext import test_clean_xml_chars
|
||||||
|
test_clean_xml_chars()
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
raw = '<a/>'
|
raw = '<a/>'
|
||||||
root = etree.fromstring(raw)
|
root = etree.fromstring(raw)
|
||||||
|
@ -4,6 +4,17 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re, htmlentitydefs
|
import re, htmlentitydefs
|
||||||
from future_builtins import map
|
from future_builtins import map
|
||||||
|
from calibre.constants import plugins, preferred_encoding
|
||||||
|
|
||||||
|
# Bind the native clean_xml_chars from the speedup plugin, if it has one.
try:
    _ncxc = plugins['speedup'][0].clean_xml_chars
except AttributeError:
    # The object loaded for the speedup plugin has no clean_xml_chars
    # attribute, so there is no native implementation to wrap.
    native_clean_xml_chars = None
else:
    def native_clean_xml_chars(x):
        '''
        Run the native clean_xml_chars on x.

        Bytestrings are first decoded with preferred_encoding; any other
        object is handed to the native function unchanged.
        '''
        text = x.decode(preferred_encoding) if isinstance(x, bytes) else x
        return _ncxc(text)
|
||||||
|
|
||||||
_ascii_pat = None
|
_ascii_pat = None
|
||||||
|
|
||||||
@ -32,9 +43,16 @@ def allowed(x):
|
|||||||
x = ord(x)
|
x = ord(x)
|
||||||
return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
|
return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
|
||||||
|
|
||||||
def clean_xml_chars(unicode_string):
|
def py_clean_xml_chars(unicode_string):
|
||||||
return u''.join(filter(allowed, unicode_string))
|
return u''.join(filter(allowed, unicode_string))
|
||||||
|
|
||||||
|
# Public entry point: use the native implementation when one was found
# (native_clean_xml_chars is not None), otherwise the pure python filter.
clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars
|
||||||
|
|
||||||
|
def test_clean_xml_chars():
    '''
    Self-test for the native clean_xml_chars implementation.

    The sample mixes a control character (\\x02), a non-BMP character
    (U+10437) and three unpaired surrogates. The cleaned result must keep
    the non-BMP character and drop the control character and the unpaired
    surrogates.

    :raises ValueError: if the native implementation is missing or returns
        anything other than the expected string.
    '''
    if native_clean_xml_chars is None:
        # Without this guard the call below fails with an opaque
        # "'NoneType' object is not callable" TypeError.
        raise ValueError(
            'Failed to XML clean: native clean_xml_chars is not available')
    raw = u'asd\x02a\U00010437x\ud801b\udffe\ud802'
    if native_clean_xml_chars(raw) != u'asda\U00010437xb':
        raise ValueError('Failed to XML clean: %r' % raw)
|
||||||
|
|
||||||
|
|
||||||
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
# Removes HTML or XML character references and entities from a text string.
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
|
@ -316,6 +316,43 @@ error:
|
|||||||
return Py_BuildValue("NII", ans, state, codep);
|
return Py_BuildValue("NII", ans, state, codep);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * clean_xml_chars(unicode_object) -> unicode
 *
 * Return a new unicode object holding the code units of `text` that are
 * allowed in XML: TAB, LF, CR, 0x20-0xD7FF (except 0x7F), 0xE000-0xFFFD and,
 * beyond the BMP, 0x10000-0x10FFFF. Everything else is dropped.
 *
 * On narrow (UTF-16) builds a non-BMP character is a surrogate pair: a high
 * surrogate immediately followed by a low surrogate is copied as a unit,
 * while unpaired surrogates are dropped.
 *
 * Sets TypeError and returns NULL if `text` is not a unicode object.
 */
static PyObject*
clean_xml_chars(PyObject *self, PyObject *text) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
    Py_UNICODE *src = NULL, *buf = NULL, ch;
    PyUnicodeObject *ans = NULL;
    Py_ssize_t i = 0, j = 0, len = 0;

    if (!PyUnicode_Check(text)) {
        PyErr_SetString(PyExc_TypeError, "A unicode string is required");
        return NULL;
    }
    /* The input is only read, so fetch its size and buffer once. */
    len = PyUnicode_GET_SIZE(text);
    src = PyUnicode_AS_UNICODE(text);

    /* Filtering only removes code units, so the input size is an upper
     * bound for the size of the result. */
    ans = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, len);
    if (ans == NULL) return PyErr_NoMemory();
    buf = ans->str;

    for (; i < len; i++) {
        ch = src[i];
#ifdef Py_UNICODE_WIDE
        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xe000 <= ch && ch <= 0xfffd) || (0xffff < ch && ch <= 0x10ffff))
            buf[j++] = ch;
#else
        /* Surrogates (0xd800-0xdfff) are let through this first test so
         * that the pairing check below can decide their fate. */
        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xd800 <= ch && ch <= 0xfffd)) {
            if (0xd800 <= ch && ch <= 0xdfff) {
                /* Keep only a high surrogate directly followed by a low one */
                if (ch <= 0xdbff && i + 1 < len && 0xdc00 <= src[i + 1] && src[i + 1] <= 0xdfff) {
                    buf[j++] = ch; buf[j++] = src[++i];
                }
            } else
                buf[j++] = ch;
        }
#endif
    }

    if (j < len) {
        /* Something was dropped: shrink the logical length and restore the
         * terminating NUL that unicode buffers carry after their last code
         * unit. Nothing is written when j == len, which also covers the
         * empty result for empty input (possibly a shared object). */
        buf[j] = 0;
        ans->length = j;
    }
    return (PyObject*)ans;
}
|
||||||
|
|
||||||
static PyMethodDef speedup_methods[] = {
|
static PyMethodDef speedup_methods[] = {
|
||||||
{"parse_date", speedup_parse_date, METH_VARARGS,
|
{"parse_date", speedup_parse_date, METH_VARARGS,
|
||||||
"parse_date()\n\nParse ISO dates faster."
|
"parse_date()\n\nParse ISO dates faster."
|
||||||
@ -351,6 +388,10 @@ static PyMethodDef speedup_methods[] = {
|
|||||||
"utf8_decode(data, [, state=0, codep=0)\n\nDecode an UTF-8 bytestring, using a strict UTF-8 decoder, that unlike python does not allow orphaned surrogates. Returns a unicode object and the state."
|
"utf8_decode(data, [, state=0, codep=0)\n\nDecode an UTF-8 bytestring, using a strict UTF-8 decoder, that unlike python does not allow orphaned surrogates. Returns a unicode object and the state."
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{"clean_xml_chars", clean_xml_chars, METH_O,
|
||||||
|
"clean_xml_chars(unicode_object)\n\nRemove codepoints in unicode_object that are not allowed in XML"
|
||||||
|
},
|
||||||
|
|
||||||
{NULL, NULL, 0, NULL}
|
{NULL, NULL, 0, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user