From e2ef55140d32015b31c4ef7b28ef95dd3dbda6a2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 26 Sep 2025 16:50:49 +0530
Subject: [PATCH] Fix #2125742 [Error during convert to AZW3](https://bugs.launchpad.net/calibre/+bug/2125742)

---
 src/calibre/utils/xml_parse.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/calibre/utils/xml_parse.py b/src/calibre/utils/xml_parse.py
index 062d253c84..9cbbf0e638 100644
--- a/src/calibre/utils/xml_parse.py
+++ b/src/calibre/utils/xml_parse.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # License: GPL v3 Copyright: 2019, Kovid Goyal
 
+import sys
+
 from lxml import etree
 
 # resolving of SYSTEM entities is turned off as entities can cause
@@ -23,16 +25,15 @@ def create_parser(recover, encoding=None):
 
 
 def safe_xml_fromstring(string_or_bytes, recover=True):
-    ans = fs(string_or_bytes, parser=create_parser(recover))
-    if ans is None and recover:
+    try:
+        ans = fs(string_or_bytes, parser=create_parser(recover))
+    except etree.XMLSyntaxError:
         # this happens on windows where if string_or_bytes is unicode and
         # contains non-BMP chars lxml chokes
-        if not isinstance(string_or_bytes, bytes):
-            string_or_bytes = string_or_bytes.encode('utf-8')
-        ans = fs(string_or_bytes, parser=create_parser(True, encoding='utf-8'))
-        if ans is not None:
-            return ans
-        ans = fs(string_or_bytes, parser=create_parser(False))
+        # ebook-convert file.epub .azw3 -m metadata.opf
+        if sys.platform != 'win32' or not isinstance(string_or_bytes, str):
+            raise
+        ans = fs(string_or_bytes.encode('utf-8'), parser=create_parser(True, encoding='utf-8'))
     return ans
 
 
@@ -101,9 +102,10 @@ def find_tests():
 
 
 def develop():
-    import sys
+    from calibre.ebooks.chardet import xml_to_unicode
     # print(etree.tostring(fs('')).decode())
-    print(etree.tostring(safe_xml_fromstring(open(sys.argv[-1], 'rb').read())).decode())
+    data = xml_to_unicode(open(sys.argv[-1], 'rb').read(), strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0]
+    print(etree.tostring(safe_xml_fromstring(data)).decode())
 
 
 if __name__ == '__main__':