Replace various other uses of etree.fromstring with safe_xml_fromstring

etree.fromstring is terminally broken on Windows when it is passed unicode objects that contain non-BMP characters.
2025-09-29 15:31:08 -04:00 · 2025-09-29 22:19:43 +05:30 · 2025-09-29 22:19:43 +05:30 · 07068b3049
commit 07068b3049
parent a4c0f08a0d
7 changed files with 20 additions and 26 deletions
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
            if os.path.exists('metadata.opf'):
-                from lxml import etree
+                from calibre.utils.xml_parse import safe_xml_fromstring
                with open('metadata.opf', 'rb') as mf:
                    raw = mf.read()
                try:
-                    root = etree.fromstring(raw)
+                    root = safe_xml_fromstring(raw)
                except Exception:
                    pass
                else:
--- a/src/calibre/ebooks/metadata/odt.py
+++ b/src/calibre/ebooks/metadata/odt.py
@@ -24,7 +24,7 @@ import json
 import os
 import re

-from lxml.etree import fromstring, tostring
+from lxml.etree import tostring
 from odf.draw import Frame as odFrame
 from odf.draw import Image as odImage
 from odf.namespaces import DCNS, METANS, OFFICENS
@@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is
 from calibre.utils.date import isoformat, parse_date
 from calibre.utils.imghdr import identify
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.zipfile import ZipFile, safe_replace
 from polyglot.builtins import as_unicode

@@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True):

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
-        root = fromstring(meta)
+        root = safe_xml_fromstring(meta)

        def find(field):
            ns, tag = fields[field]
@@ -175,7 +176,7 @@ def set_metadata(stream, mi):


 def _set_metadata(raw, mi):
-    root = fromstring(raw)
+    root = safe_xml_fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}

--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -75,8 +75,7 @@ def XPath(x):


 def to_metadata(browser, log, entry_, timeout, running_a_test=False):  # {{{
-    from lxml import etree
-
+    from calibre.utils.xml_parse import safe_xml_fromstring
    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
@@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False):  # {{{
            with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
                f.write(raw)
                print('Book details saved to:', f.name, file=sys.stderr)
-        feed = etree.fromstring(
-            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
-            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
-        )
+        feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        return entry(feed)[0]

    if isinstance(entry_, str):
@@ -494,7 +490,7 @@ class GoogleBooks(Source):
        identifiers={},
        timeout=30
    ):
-        from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
        entry = XPath('//atom:entry')
        identifiers = identifiers.copy()
        br = self.browser
@@ -525,10 +521,7 @@ class GoogleBooks(Source):
                return False, as_unicode(e)

            try:
-                feed = etree.fromstring(
-                    xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
-                    parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
-                )
+                feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
                return True, entry(feed)
            except Exception as e:
                log.exception('Failed to parse identify results')
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -1425,6 +1425,7 @@ class Page:
 class PDFDocument:

    def __init__(self, xml, opts, log):
+        from calibre.utils.xml_parse import safe_xml_fromstring
        # from calibre.rpdb import set_trace;  set_trace()

        self.opts, self.log = opts, log
@@ -1435,8 +1436,7 @@ class PDFDocument:
        if self.opts.pdf_footer_regex is None:
            self.opts.pdf_footer_regex = ''  # Do nothing

-        parser = etree.XMLParser(recover=True)
-        self.root = etree.fromstring(xml, parser=parser)
+        self.root = safe_xml_fromstring(xml)
        idc = iter(range(sys.maxsize))
        self.stats = DocStats()

--- a/src/calibre/gui2/store/stores/litres_plugin.py
+++ b/src/calibre/gui2/store/stores/litres_plugin.py
@@ -17,7 +17,6 @@ except ImportError:

 from contextlib import closing

-from lxml import etree
 from qt.core import QUrl

 from calibre import browser, prints, url_slash_cleaner
@@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.basic_config import BasicStoreConfig
 from calibre.gui2.store.search_result import SearchResult
 from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from calibre.utils.xml_parse import safe_xml_fromstring


 class LitResStore(BasicStoreConfig, StorePlugin):
@@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
            ungzipResponse(r, br)
            raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]

-            doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
+            doc = safe_xml_fromstring(raw)
            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
--- a/src/calibre/gui2/store/stores/manybooks_plugin.py
+++ b/src/calibre/gui2/store/stores/manybooks_plugin.py
@@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en'
 import mimetypes
 from contextlib import closing

-from lxml import etree
-
 from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
 from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
 from calibre.utils.opensearch.description import Description
 from calibre.utils.opensearch.query import Query
+from calibre.utils.xml_parse import safe_xml_fromstring


 def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
@@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw_data = f.read()
-        raw_data = raw_data.decode('utf-8', 'replace')
-        doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
+        doc = safe_xml_fromstring(raw_data)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
@@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://

            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout/4)) as df:
-                ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
+                ddoc = safe_xml_fromstring(df.read())
                ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                if ddata:
                    ddata = ddata[0]
--- a/src/calibre/utils/img.py
+++ b/src/calibre/utils/img.py
@@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''):

 def read_alt_text_from_xmp(xmp, target_lang='') -> str:
    from lxml import etree
+
+    from calibre.utils.xml_parse import safe_xml_fromstring
    try:
-        root = etree.fromstring(xmp)
+        root = safe_xml_fromstring(xmp)
    except Exception:
        return ''
    # print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())