From 1ba8e6446867c130ffd3c6583afe90c687e5da9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 17 Feb 2020 03:15:53 +0530 Subject: [PATCH] py3: Fix clean_xml_chars implementation Fixes #1863517 [Characters are dropped from title](https://bugs.launchpad.net/calibre/+bug/1863517) --- src/calibre/utils/cleantext.py | 16 +++++++--------- src/calibre/utils/speedup.c | 7 +++++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index 2d62e06858..d7c9d74c20 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -8,15 +8,13 @@ from polyglot.builtins import codepoint_to_chr, map, range, filter from polyglot.html_entities import name2codepoint from calibre.constants import plugins, preferred_encoding -try: - _ncxc = plugins['speedup'][0].clean_xml_chars -except AttributeError: - native_clean_xml_chars = None -else: - def native_clean_xml_chars(x): - if isinstance(x, bytes): - x = x.decode(preferred_encoding) - return _ncxc(x) +_ncxc = plugins['speedup'][0].clean_xml_chars + + +def native_clean_xml_chars(x): + if isinstance(x, bytes): + x = x.decode(preferred_encoding) + return _ncxc(x) def ascii_pat(for_binary=False): diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c index ca990150d1..30b82638c7 100644 --- a/src/calibre/utils/speedup.c +++ b/src/calibre/utils/speedup.c @@ -394,8 +394,11 @@ clean_xml_chars(PyObject *self, PyObject *text) { // based on https://en.wikipedia.org/wiki/Valid_characters_in_XML#Non-restricted_characters // python 3.3+ unicode strings never contain surrogate pairs, since if // they did, they would be represented as UTF-32 - if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) || - ch == 9 || ch == 10 || ch == 13 || + if ((0x20 <= ch && ch <= 0x7e) || + ch == 0x9 || ch == 0xa || ch == 0xd || ch == 0x85 || + (0x00A0 <= ch && ch <= 0xD7FF) || + (0xE000 <= ch && ch <= 0xFDCF) || + (0xFDF0 <= ch && ch <= 0xFFFD) || (0xffff < ch && 
ch <= 0x10ffff)) { PyUnicode_WRITE(text_kind, result_text, target_i, ch); target_i += 1;