Replace various other uses of etree.fromstring

etree.fromstring is terminally broken on Windows when passed unicode
objects that contain non-BMP characters.
This commit is contained in:
Kovid Goyal 2025-09-29 22:19:43 +05:30
parent a4c0f08a0d
commit 07068b3049
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 20 additions and 26 deletions

View File

@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin):
with open(x, 'rb') as tf:
txt += tf.read() + b'\n\n'
if os.path.exists('metadata.opf'):
from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
with open('metadata.opf', 'rb') as mf:
raw = mf.read()
try:
root = etree.fromstring(raw)
root = safe_xml_fromstring(raw)
except Exception:
pass
else:

View File

@ -24,7 +24,7 @@ import json
import os
import re
from lxml.etree import fromstring, tostring
from lxml.etree import tostring
from odf.draw import Frame as odFrame
from odf.draw import Image as odImage
from odf.namespaces import DCNS, METANS, OFFICENS
@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is
from calibre.utils.date import isoformat, parse_date
from calibre.utils.imghdr import identify
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.zipfile import ZipFile, safe_replace
from polyglot.builtins import as_unicode
@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True):
with ZipFile(stream) as zf:
meta = zf.read('meta.xml')
root = fromstring(meta)
root = safe_xml_fromstring(meta)
def find(field):
ns, tag = fields[field]
@ -175,7 +176,7 @@ def set_metadata(stream, mi):
def _set_metadata(raw, mi):
root = fromstring(raw)
root = safe_xml_fromstring(raw)
namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
nsrmap = {v: k for k, v in namespaces.items()}

View File

@ -75,8 +75,7 @@ def XPath(x):
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
# total_results = XPath('//openSearch:totalResults')
# start_index = XPath('//openSearch:startIndex')
# items_per_page = XPath('//openSearch:itemsPerPage')
@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
f.write(raw)
print('Book details saved to:', f.name, file=sys.stderr)
feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
return entry(feed)[0]
if isinstance(entry_, str):
@ -494,7 +490,7 @@ class GoogleBooks(Source):
identifiers={},
timeout=30
):
from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
entry = XPath('//atom:entry')
identifiers = identifiers.copy()
br = self.browser
@ -525,10 +521,7 @@ class GoogleBooks(Source):
return False, as_unicode(e)
try:
feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
return True, entry(feed)
except Exception as e:
log.exception('Failed to parse identify results')

View File

@ -1425,6 +1425,7 @@ class Page:
class PDFDocument:
def __init__(self, xml, opts, log):
from calibre.utils.xml_parse import safe_xml_fromstring
# from calibre.rpdb import set_trace; set_trace()
self.opts, self.log = opts, log
@ -1435,8 +1436,7 @@ class PDFDocument:
if self.opts.pdf_footer_regex is None:
self.opts.pdf_footer_regex = '' # Do nothing
parser = etree.XMLParser(recover=True)
self.root = etree.fromstring(xml, parser=parser)
self.root = safe_xml_fromstring(xml)
idc = iter(range(sys.maxsize))
self.stats = DocStats()

View File

@ -17,7 +17,6 @@ except ImportError:
from contextlib import closing
from lxml import etree
from qt.core import QUrl
from calibre import browser, prints, url_slash_cleaner
@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.xml_parse import safe_xml_fromstring
class LitResStore(BasicStoreConfig, StorePlugin):
@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
ungzipResponse(r, br)
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
doc = safe_xml_fromstring(raw)
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
if counter <= 0:
break

View File

@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en'
import mimetypes
from contextlib import closing
from lxml import etree
from calibre import browser
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query
from calibre.utils.xml_parse import safe_xml_fromstring
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
br = browser()
with closing(br.open(url, timeout=timeout)) as f:
raw_data = f.read()
raw_data = raw_data.decode('utf-8', 'replace')
doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
doc = safe_xml_fromstring(raw_data)
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
# Follow the detail link to get the rest of the info.
with closing(br.open(detail_href, timeout=timeout/4)) as df:
ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
ddoc = safe_xml_fromstring(df.read())
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
if ddata:
ddata = ddata[0]

View File

@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''):
def read_alt_text_from_xmp(xmp, target_lang='') -> str:
from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
try:
root = etree.fromstring(xmp)
root = safe_xml_fromstring(xmp)
except Exception:
return ''
# print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())