This commit is contained in:
Kovid Goyal 2025-09-26 16:50:49 +05:30
parent a5fbde7f25
commit e2ef55140d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net> # License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
import sys
from lxml import etree from lxml import etree
# resolving of SYSTEM entities is turned off as entities can cause # resolving of SYSTEM entities is turned off as entities can cause
@ -23,16 +25,15 @@ def create_parser(recover, encoding=None):
def safe_xml_fromstring(string_or_bytes, recover=True): def safe_xml_fromstring(string_or_bytes, recover=True):
ans = fs(string_or_bytes, parser=create_parser(recover)) try:
if ans is None and recover: ans = fs(string_or_bytes, parser=create_parser(recover))
except etree.XMLSyntaxError:
# this happens on windows where if string_or_bytes is unicode and # this happens on windows where if string_or_bytes is unicode and
# contains non-BMP chars lxml chokes # contains non-BMP chars lxml chokes
if not isinstance(string_or_bytes, bytes): # ebook-convert file.epub .azw3 -m metadata.opf
string_or_bytes = string_or_bytes.encode('utf-8') if sys.platform != 'win32' or not isinstance(string_or_bytes, str):
ans = fs(string_or_bytes, parser=create_parser(True, encoding='utf-8')) raise
if ans is not None: ans = fs(string_or_bytes.encode('utf-8'), parser=create_parser(True, encoding='utf-8'))
return ans
ans = fs(string_or_bytes, parser=create_parser(False))
return ans return ans
@ -101,9 +102,10 @@ def find_tests():
def develop(): def develop():
import sys from calibre.ebooks.chardet import xml_to_unicode
# print(etree.tostring(fs('<r/>')).decode()) # print(etree.tostring(fs('<r/>')).decode())
print(etree.tostring(safe_xml_fromstring(open(sys.argv[-1], 'rb').read())).decode()) data = xml_to_unicode(open(sys.argv[-1], 'rb').read(), strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0]
print(etree.tostring(safe_xml_fromstring(data)).decode())
if __name__ == '__main__': if __name__ == '__main__':