py3: Fix clean_xml_chars implementation

Fixes #1863517 [Characters are dropped from title](https://bugs.launchpad.net/calibre/+bug/1863517)
This commit is contained in:
Kovid Goyal 2020-02-17 03:15:53 +05:30
parent a91ed25d9b
commit 1ba8e64468
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 12 additions and 11 deletions

View File

@ -8,15 +8,13 @@ from polyglot.builtins import codepoint_to_chr, map, range, filter
from polyglot.html_entities import name2codepoint
from calibre.constants import plugins, preferred_encoding
try:
_ncxc = plugins['speedup'][0].clean_xml_chars
except AttributeError:
native_clean_xml_chars = None
else:
def native_clean_xml_chars(x):
if isinstance(x, bytes):
x = x.decode(preferred_encoding)
return _ncxc(x)
_ncxc = plugins['speedup'][0].clean_xml_chars
def native_clean_xml_chars(x):
if isinstance(x, bytes):
x = x.decode(preferred_encoding)
return _ncxc(x)
def ascii_pat(for_binary=False):

View File

@ -394,8 +394,11 @@ clean_xml_chars(PyObject *self, PyObject *text) {
// based on https://en.wikipedia.org/wiki/Valid_characters_in_XML#Non-restricted_characters
// Python 3.3+ unicode strings store one code point per element, so a
// character above U+FFFF is never split into a UTF-16 surrogate pair; it is
// held directly in the 4-byte (UCS-4) representation
if ((0x20 <= ch && ch <= 0xd7ff && ch != 0x7f) ||
ch == 9 || ch == 10 || ch == 13 ||
if ((0x20 <= ch && ch <= 0x7e) ||
ch == 0x9 || ch == 0xa || ch == 0xd || ch == 0x85 ||
(0x00A0 <= ch && ch <= 0xD7FF) ||
(0xE000 <= ch && ch <= 0xFDCF) ||
(0xFDF0 <= ch && ch <= 0xFFFD) ||
(0xffff < ch && ch <= 0x10ffff)) {
PyUnicode_WRITE(text_kind, result_text, target_i, ch);
target_i += 1;