From 5a7b2510254c72fdf233309082a911f98c3f52c1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 19 Dec 2015 18:44:34 +0530
Subject: [PATCH] Fix implementation of clean_xml_chars

Also speed it up by implementing it in native code
---
 src/calibre/test_build.py      |  2 ++
 src/calibre/utils/cleantext.py | 20 ++++++++++++++++-
 src/calibre/utils/speedup.c    | 41 ++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 55485c5ba2..29c1b848bd 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -79,6 +79,8 @@ def test_plugins():
     print ('Loaded all plugins successfully!')
 
 def test_lxml():
+    from calibre.utils.cleantext import test_clean_xml_chars
+    test_clean_xml_chars()
     from lxml import etree
     raw = '<a/>'
     root = etree.fromstring(raw)
diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index c38aa18eda..5a68ff736b 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -4,6 +4,17 @@ __docformat__ = 'restructuredtext en'
 
 import re, htmlentitydefs
 from future_builtins import map
+from calibre.constants import plugins, preferred_encoding
+
+try:
+    _ncxc = plugins['speedup'][0].clean_xml_chars
+except AttributeError:
+    native_clean_xml_chars = None
+else:
+    def native_clean_xml_chars(x):
+        if isinstance(x, bytes):
+            x = x.decode(preferred_encoding)
+        return _ncxc(x)
 
 _ascii_pat = None
 
@@ -32,9 +43,16 @@ def allowed(x):
     x = ord(x)
     return (x != 127 and (31 < x < 0xd7ff or x in (9, 10, 13))) or (0xe000 < x < 0xfffd) or (0x10000 < x < 0x10ffff)
 
-def clean_xml_chars(unicode_string):
+def py_clean_xml_chars(unicode_string):  # pure-python implementation: keeps only the characters accepted by allowed()
     return u''.join(filter(allowed, unicode_string))
 
+clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars  # use the native version unless it is None
+
+def test_clean_xml_chars():  # check of the native cleaner; raises ValueError when the output is wrong
+    raw = u'asd\x02a\U00010437x\ud801b\udffe\ud802'  # a control char, one non-BMP char and three unpaired surrogates
+    if native_clean_xml_chars(raw) != u'asda\U00010437xb':
+        raise ValueError('Failed to XML clean: %r' % raw)
+
 
 # Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
 # Removes HTML or XML character references and entities from a text string.
diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c
index 460c33a657..d989cb9f0d 100644
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -316,6 +316,43 @@ error:
 	return Py_BuildValue("NII", ans, state, codep);
 }
 
+/* Return a new unicode object: text minus every code unit XML 1.0 forbids.
+ * Raises TypeError when text is not a unicode object. */
+static PyObject*
+clean_xml_chars(PyObject *self, PyObject *text) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    Py_UNICODE *src = NULL, *buf = NULL, ch;
+    PyObject *ans = NULL;
+    Py_ssize_t i = 0, j = 0, sz = 0;
+    if (!PyUnicode_Check(text)) {
+        PyErr_SetString(PyExc_TypeError, "A unicode string is required");
+        return NULL;
+    }
+    src = PyUnicode_AS_UNICODE(text); sz = PyUnicode_GET_SIZE(text);
+    buf = PyMem_New(Py_UNICODE, sz);  // scratch buffer, so the result is created at its final length
+    if (buf == NULL) return PyErr_NoMemory();
+    for (; i < sz; i++) {
+        ch = src[i];
+#ifdef Py_UNICODE_WIDE
+        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xe000 <= ch && ch <= 0xfffd) || (0xffff < ch && ch <= 0x10ffff))
+            buf[j++] = ch;
+#else
+        if (0xd800 <= ch && ch <= 0xdfff) {
+            // Narrow build: a surrogate is kept only as a high unit directly followed by a low unit
+            if (ch <= 0xdbff && i + 1 < sz && 0xdc00 <= src[i + 1] && src[i + 1] <= 0xdfff) {
+                buf[j++] = ch; buf[j++] = src[++i];
+            }
+        } else if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || ch == 9 || ch == 10 || ch == 13 || (0xe000 <= ch && ch <= 0xfffd))
+            buf[j++] = ch;
+#endif
+    }
+    ans = PyUnicode_FromUnicode(buf, j);  // copies the j kept units; NULL with an exception set on failure
+    PyMem_Free(buf);
+    return ans;
+}
+
 static PyMethodDef speedup_methods[] = {
     {"parse_date", speedup_parse_date, METH_VARARGS,
         "parse_date()\n\nParse ISO dates faster."
@@ -351,6 +388,10 @@ static PyMethodDef speedup_methods[] = {
 		"utf8_decode(data, [, state=0, codep=0)\n\nDecode an UTF-8 bytestring, using a strict UTF-8 decoder, that unlike python does not allow orphaned surrogates. Returns a unicode object and the state."
 	},
 
+    {"clean_xml_chars", clean_xml_chars, METH_O,
+        "clean_xml_chars(unicode_object)\n\nRemove codepoints in unicode_object that are not allowed in XML"
+    },
+
     {NULL, NULL, 0, NULL}
 };