When parsing XML, always convert unicode strings to UTF-8

Apparently libxml2 does this conversion internally anyway, and does it
in a buggy fashion. See https://bugs.launchpad.net/lxml/+bug/2125756
This commit is contained in:
Kovid Goyal 2025-09-28 14:11:32 +05:30
parent 48147ebd52
commit 7d79ffaea7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -25,16 +25,12 @@ def create_parser(recover, encoding=None):
def safe_xml_fromstring(string_or_bytes, recover=True):
try:
ans = fs(string_or_bytes, parser=create_parser(recover))
except etree.XMLSyntaxError:
# this happens on windows where if string_or_bytes is unicode and
# contains non-BMP chars lxml chokes
if isinstance(string_or_bytes, str):
# libxml2 anyway converts to UTF-8 to parse internally
# and does so with bugs, see
# https://bugs.launchpad.net/lxml/+bug/2125756
if sys.platform != 'win32' or not isinstance(string_or_bytes, str):
raise
ans = fs(string_or_bytes.encode('utf-8'), parser=create_parser(True, encoding='utf-8'))
return ans
string_or_bytes = string_or_bytes.encode('utf-8')
return fs(string_or_bytes, parser=create_parser(recover))
def unsafe_xml_fromstring(string_or_bytes):