Replace various other uses of etree.fromstring with safe_xml_fromstring

lxml's etree.fromstring is terminally broken on Windows when it is passed
unicode objects that contain non-BMP chars.
This commit is contained in:
Kovid Goyal 2025-09-29 22:19:43 +05:30
parent a4c0f08a0d
commit 07068b3049
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 20 additions and 26 deletions

View File

@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin):
with open(x, 'rb') as tf: with open(x, 'rb') as tf:
txt += tf.read() + b'\n\n' txt += tf.read() + b'\n\n'
if os.path.exists('metadata.opf'): if os.path.exists('metadata.opf'):
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
with open('metadata.opf', 'rb') as mf: with open('metadata.opf', 'rb') as mf:
raw = mf.read() raw = mf.read()
try: try:
root = etree.fromstring(raw) root = safe_xml_fromstring(raw)
except Exception: except Exception:
pass pass
else: else:

View File

@ -24,7 +24,7 @@ import json
import os import os
import re import re
from lxml.etree import fromstring, tostring from lxml.etree import tostring
from odf.draw import Frame as odFrame from odf.draw import Frame as odFrame
from odf.draw import Image as odImage from odf.draw import Image as odImage
from odf.namespaces import DCNS, METANS, OFFICENS from odf.namespaces import DCNS, METANS, OFFICENS
@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is
from calibre.utils.date import isoformat, parse_date from calibre.utils.date import isoformat, parse_date
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.zipfile import ZipFile, safe_replace from calibre.utils.zipfile import ZipFile, safe_replace
from polyglot.builtins import as_unicode from polyglot.builtins import as_unicode
@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True):
with ZipFile(stream) as zf: with ZipFile(stream) as zf:
meta = zf.read('meta.xml') meta = zf.read('meta.xml')
root = fromstring(meta) root = safe_xml_fromstring(meta)
def find(field): def find(field):
ns, tag = fields[field] ns, tag = fields[field]
@ -175,7 +176,7 @@ def set_metadata(stream, mi):
def _set_metadata(raw, mi): def _set_metadata(raw, mi):
root = fromstring(raw) root = safe_xml_fromstring(raw)
namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS} namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
nsrmap = {v: k for k, v in namespaces.items()} nsrmap = {v: k for k, v in namespaces.items()}

View File

@ -75,8 +75,7 @@ def XPath(x):
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
# total_results = XPath('//openSearch:totalResults') # total_results = XPath('//openSearch:totalResults')
# start_index = XPath('//openSearch:startIndex') # start_index = XPath('//openSearch:startIndex')
# items_per_page = XPath('//openSearch:itemsPerPage') # items_per_page = XPath('//openSearch:itemsPerPage')
@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f: with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
f.write(raw) f.write(raw)
print('Book details saved to:', f.name, file=sys.stderr) print('Book details saved to:', f.name, file=sys.stderr)
feed = etree.fromstring( feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
return entry(feed)[0] return entry(feed)[0]
if isinstance(entry_, str): if isinstance(entry_, str):
@ -494,7 +490,7 @@ class GoogleBooks(Source):
identifiers={}, identifiers={},
timeout=30 timeout=30
): ):
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
entry = XPath('//atom:entry') entry = XPath('//atom:entry')
identifiers = identifiers.copy() identifiers = identifiers.copy()
br = self.browser br = self.browser
@ -525,10 +521,7 @@ class GoogleBooks(Source):
return False, as_unicode(e) return False, as_unicode(e)
try: try:
feed = etree.fromstring( feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
return True, entry(feed) return True, entry(feed)
except Exception as e: except Exception as e:
log.exception('Failed to parse identify results') log.exception('Failed to parse identify results')

View File

@ -1425,6 +1425,7 @@ class Page:
class PDFDocument: class PDFDocument:
def __init__(self, xml, opts, log): def __init__(self, xml, opts, log):
from calibre.utils.xml_parse import safe_xml_fromstring
# from calibre.rpdb import set_trace; set_trace() # from calibre.rpdb import set_trace; set_trace()
self.opts, self.log = opts, log self.opts, self.log = opts, log
@ -1435,8 +1436,7 @@ class PDFDocument:
if self.opts.pdf_footer_regex is None: if self.opts.pdf_footer_regex is None:
self.opts.pdf_footer_regex = '' # Do nothing self.opts.pdf_footer_regex = '' # Do nothing
parser = etree.XMLParser(recover=True) self.root = safe_xml_fromstring(xml)
self.root = etree.fromstring(xml, parser=parser)
idc = iter(range(sys.maxsize)) idc = iter(range(sys.maxsize))
self.stats = DocStats() self.stats = DocStats()

View File

@ -17,7 +17,6 @@ except ImportError:
from contextlib import closing from contextlib import closing
from lxml import etree
from qt.core import QUrl from qt.core import QUrl
from calibre import browser, prints, url_slash_cleaner from calibre import browser, prints, url_slash_cleaner
@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin
from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.xml_parse import safe_xml_fromstring
class LitResStore(BasicStoreConfig, StorePlugin): class LitResStore(BasicStoreConfig, StorePlugin):
@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
ungzipResponse(r, br) ungzipResponse(r, br)
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0] raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) doc = safe_xml_fromstring(raw)
for data in doc.xpath('//*[local-name() = "fb2-book"]'): for data in doc.xpath('//*[local-name() = "fb2-book"]'):
if counter <= 0: if counter <= 0:
break break

View File

@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en'
import mimetypes import mimetypes
from contextlib import closing from contextlib import closing
from lxml import etree
from calibre import browser from calibre import browser
from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
from calibre.utils.opensearch.description import Description from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query from calibre.utils.opensearch.query import Query
from calibre.utils.xml_parse import safe_xml_fromstring
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'): def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
br = browser() br = browser()
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
raw_data = f.read() raw_data = f.read()
raw_data = raw_data.decode('utf-8', 'replace') doc = safe_xml_fromstring(raw_data)
doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for data in doc.xpath('//*[local-name() = "entry"]'): for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0: if counter <= 0:
break break
@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
# Follow the detail link to get the rest of the info. # Follow the detail link to get the rest of the info.
with closing(br.open(detail_href, timeout=timeout/4)) as df: with closing(br.open(detail_href, timeout=timeout/4)) as df:
ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) ddoc = safe_xml_fromstring(df.read())
ddata = ddoc.xpath('//*[local-name() = "entry"][1]') ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
if ddata: if ddata:
ddata = ddata[0] ddata = ddata[0]

View File

@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''):
def read_alt_text_from_xmp(xmp, target_lang='') -> str: def read_alt_text_from_xmp(xmp, target_lang='') -> str:
from lxml import etree from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
try: try:
root = etree.fromstring(xmp) root = safe_xml_fromstring(xmp)
except Exception: except Exception:
return '' return ''
# print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode()) # print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())