py3: Fix clean_xml_chars implementation

Fixes #1863517 [Characters are dropped from title](https://bugs.launchpad.net/calibre/+bug/1863517)
2025-07-09 03:04:10 -04:00 · 2020-02-17 03:15:53 +05:30 · 2020-02-17 03:15:53 +05:30 · 1ba8e64468
commit 1ba8e64468
parent a91ed25d9b
2 changed files with 12 additions and 11 deletions
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -8,15 +8,13 @@ from polyglot.builtins import codepoint_to_chr, map, range, filter
 from polyglot.html_entities import name2codepoint
 from calibre.constants import plugins, preferred_encoding

-try:
-    _ncxc = plugins['speedup'][0].clean_xml_chars
-except AttributeError:
-    native_clean_xml_chars = None
-else:
-    def native_clean_xml_chars(x):
-        if isinstance(x, bytes):
-            x = x.decode(preferred_encoding)
-        return _ncxc(x)
+_ncxc = plugins['speedup'][0].clean_xml_chars
+
+
+def native_clean_xml_chars(x):
+    if isinstance(x, bytes):
+        x = x.decode(preferred_encoding)
+    return _ncxc(x)


 def ascii_pat(for_binary=False):
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -394,8 +394,11 @@ clean_xml_chars(PyObject *self, PyObject *text) {
        // based on https://en.wikipedia.org/wiki/Valid_characters_in_XML#Non-restricted_characters
        // python 3.3+ unicode strings never contain surrogate pairs, since if
        // they did, they would be represented as UTF-32
-        if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) ||
-                ch == 9 || ch == 10 || ch == 13 ||
+        if ((0x20 <= ch && ch <= 0x7e) ||
+                ch == 0x9 || ch == 0xa || ch == 0xd || ch == 0x85 ||
+				(0x00A0 <= ch && ch <= 0xD7FF) ||
+				(0xE000 <= ch && ch <= 0xFDCF) ||
+				(0xFDF0 <= ch && ch <= 0xFFFD) ||
                (0xffff < ch && ch <= 0x10ffff)) {
            PyUnicode_WRITE(text_kind, result_text, target_i, ch);
            target_i += 1;