diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index d6d43a644c..350d0f69ff 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin): with open(x, 'rb') as tf: txt += tf.read() + b'\n\n' if os.path.exists('metadata.opf'): - from lxml import etree + from calibre.utils.xml_parse import safe_xml_fromstring with open('metadata.opf', 'rb') as mf: raw = mf.read() try: - root = etree.fromstring(raw) + root = safe_xml_fromstring(raw) except Exception: pass else: diff --git a/src/calibre/ebooks/metadata/odt.py b/src/calibre/ebooks/metadata/odt.py index 8ba5211b1a..6df9d90ca9 100644 --- a/src/calibre/ebooks/metadata/odt.py +++ b/src/calibre/ebooks/metadata/odt.py @@ -24,7 +24,7 @@ import json import os import re -from lxml.etree import fromstring, tostring +from lxml.etree import tostring from odf.draw import Frame as odFrame from odf.draw import Image as odImage from odf.namespaces import DCNS, METANS, OFFICENS @@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is from calibre.utils.date import isoformat, parse_date from calibre.utils.imghdr import identify from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 +from calibre.utils.xml_parse import safe_xml_fromstring from calibre.utils.zipfile import ZipFile, safe_replace from polyglot.builtins import as_unicode @@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True): with ZipFile(stream) as zf: meta = zf.read('meta.xml') - root = fromstring(meta) + root = safe_xml_fromstring(meta) def find(field): ns, tag = fields[field] @@ -175,7 +176,7 @@ def set_metadata(stream, mi): def _set_metadata(raw, mi): - root = fromstring(raw) + root = safe_xml_fromstring(raw) namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS} nsrmap = {v: k for k, v in namespaces.items()} diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 287f14ed2a..2797dbe0cf 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -75,8 +75,7 @@ def XPath(x): def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{ - from lxml import etree - + from calibre.utils.xml_parse import safe_xml_fromstring # total_results = XPath('//openSearch:totalResults') # start_index = XPath('//openSearch:startIndex') # items_per_page = XPath('//openSearch:itemsPerPage') @@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{ with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f: f.write(raw) print('Book details saved to:', f.name, file=sys.stderr) - feed = etree.fromstring( - xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], - parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) - ) + feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]) return entry(feed)[0] if isinstance(entry_, str): @@ -494,7 +490,7 @@ class GoogleBooks(Source): identifiers={}, timeout=30 ): - from lxml import etree + from calibre.utils.xml_parse import safe_xml_fromstring entry = XPath('//atom:entry') identifiers = identifiers.copy() br = self.browser @@ -525,10 +521,7 @@ class GoogleBooks(Source): return False, as_unicode(e) try: - feed = etree.fromstring( - xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], - parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False) - ) + feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]) return True, entry(feed) except Exception as e: log.exception('Failed to parse identify results') diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 96d267080d..7d11be506a 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -1425,6 +1425,7 @@ class Page: class PDFDocument: def __init__(self, xml, opts, log): + from calibre.utils.xml_parse import safe_xml_fromstring # from calibre.rpdb import set_trace; set_trace() self.opts, self.log = opts, log @@ -1435,8 +1436,7 @@ class PDFDocument: if self.opts.pdf_footer_regex is None: self.opts.pdf_footer_regex = '' # Do nothing - parser = etree.XMLParser(recover=True) - self.root = etree.fromstring(xml, parser=parser) + self.root = safe_xml_fromstring(xml) idc = iter(range(sys.maxsize)) self.stats = DocStats() diff --git a/src/calibre/gui2/store/stores/litres_plugin.py b/src/calibre/gui2/store/stores/litres_plugin.py index a15e26be79..288062c01e 100644 --- a/src/calibre/gui2/store/stores/litres_plugin.py +++ b/src/calibre/gui2/store/stores/litres_plugin.py @@ -17,7 +17,6 @@ except ImportError: from contextlib import closing -from lxml import etree from qt.core import QUrl from calibre import browser, prints, url_slash_cleaner @@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog +from calibre.utils.xml_parse import safe_xml_fromstring class LitResStore(BasicStoreConfig, StorePlugin): @@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin): ungzipResponse(r, br) raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0] - doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + doc = safe_xml_fromstring(raw) for data in doc.xpath('//*[local-name() = "fb2-book"]'): if counter <= 0: break diff --git a/src/calibre/gui2/store/stores/manybooks_plugin.py b/src/calibre/gui2/store/stores/manybooks_plugin.py index ec174949d5..dd355a1f5e 100644 --- a/src/calibre/gui2/store/stores/manybooks_plugin.py +++ b/src/calibre/gui2/store/stores/manybooks_plugin.py @@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en' import mimetypes from contextlib import closing -from lxml import etree - from calibre import browser from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore from calibre.gui2.store.search_result import SearchResult from calibre.utils.opensearch.description import Description from calibre.utils.opensearch.query import Query +from calibre.utils.xml_parse import safe_xml_fromstring def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'): @@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http:// br = browser() with closing(br.open(url, timeout=timeout)) as f: raw_data = f.read() - raw_data = raw_data.decode('utf-8', 'replace') - doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + doc = safe_xml_fromstring(raw_data) for data in doc.xpath('//*[local-name() = "entry"]'): if counter <= 0: break @@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http:// # Follow the detail link to get the rest of the info. with closing(br.open(detail_href, timeout=timeout/4)) as df: - ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + ddoc = safe_xml_fromstring(df.read()) ddata = ddoc.xpath('//*[local-name() = "entry"][1]') if ddata: ddata = ddata[0] diff --git a/src/calibre/utils/img.py b/src/calibre/utils/img.py index 6143000280..eb5aadfa50 100644 --- a/src/calibre/utils/img.py +++ b/src/calibre/utils/img.py @@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''): def read_alt_text_from_xmp(xmp, target_lang='') -> str: from lxml import etree + + from calibre.utils.xml_parse import safe_xml_fromstring try: - root = etree.fromstring(xmp) + root = safe_xml_fromstring(xmp) except Exception: return '' # print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())