mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-04-28 11:40:44 -04:00
When parsing XML, always encode Unicode strings as UTF-8
Apparently libxml2 performs this conversion internally anyway, and does so in a buggy fashion. See https://bugs.launchpad.net/lxml/+bug/2125756
This commit is contained in:
parent
48147ebd52
commit
7d79ffaea7
@ -25,16 +25,12 @@ def create_parser(recover, encoding=None):
|
||||
|
||||
|
||||
def safe_xml_fromstring(string_or_bytes, recover=True):
|
||||
try:
|
||||
ans = fs(string_or_bytes, parser=create_parser(recover))
|
||||
except etree.XMLSyntaxError:
|
||||
# this happens on windows where if string_or_bytes is unicode and
|
||||
# contains non-BMP chars lxml chokes
|
||||
if isinstance(string_or_bytes, str):
|
||||
# libxml2 anyway converts to UTF-8 to parse internally
|
||||
# and does so with bugs, see
|
||||
# https://bugs.launchpad.net/lxml/+bug/2125756
|
||||
if sys.platform != 'win32' or not isinstance(string_or_bytes, str):
|
||||
raise
|
||||
ans = fs(string_or_bytes.encode('utf-8'), parser=create_parser(True, encoding='utf-8'))
|
||||
return ans
|
||||
string_or_bytes = string_or_bytes.encode('utf-8')
|
||||
return fs(string_or_bytes, parser=create_parser(recover))
|
||||
|
||||
|
||||
def unsafe_xml_fromstring(string_or_bytes):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user