From 68febe94ca2baf2a0979668b7b61a1e3b0432f0f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 29 Dec 2019 18:01:43 +0530
Subject: [PATCH] Do not resolve entities when parsing XML

Resolving entities is dangerous since lxml will actually
read file:// URLs in entity definitions. Fixes #1857800 [Private bug](https://bugs.launchpad.net/calibre/+bug/1857800)
---
 src/calibre/devices/prs505/sony_cache.py      | 17 +++++------
 src/calibre/ebooks/__init__.py                |  4 +--
 .../ebooks/conversion/plugins/epub_input.py   |  7 +++--
 .../ebooks/conversion/plugins/fb2_input.py    | 15 ++++------
 .../ebooks/conversion/plugins/lit_input.py    |  4 +--
 .../ebooks/conversion/plugins/lrf_input.py    | 14 +++-------
 .../ebooks/conversion/plugins/rtf_input.py    |  6 ++--
 .../ebooks/conversion/plugins/snb_input.py    |  8 +++---
 src/calibre/ebooks/docx/container.py          |  3 +-
 src/calibre/ebooks/docx/dump.py               |  3 +-
 src/calibre/ebooks/fb2/fb2ml.py               |  3 +-
 src/calibre/ebooks/metadata/docx.py           |  7 ++---
 src/calibre/ebooks/metadata/epub.py           |  8 ++----
 src/calibre/ebooks/metadata/fb2.py            |  6 ++--
 src/calibre/ebooks/metadata/lrx.py            |  4 +--
 src/calibre/ebooks/metadata/opf2.py           |  3 +-
 src/calibre/ebooks/metadata/opf3_test.py      |  7 ++---
 src/calibre/ebooks/metadata/snb.py            |  4 +--
 src/calibre/ebooks/metadata/sources/douban.py |  8 ++++--
 src/calibre/ebooks/metadata/sources/google.py |  8 +++---
 src/calibre/ebooks/metadata/toc.py            |  4 +--
 src/calibre/ebooks/metadata/utils.py          |  6 ++--
 src/calibre/ebooks/metadata/xmp.py            |  5 ++--
 src/calibre/ebooks/mobi/writer8/toc.py        |  5 ++--
 src/calibre/ebooks/odt/input.py               |  3 +-
 src/calibre/ebooks/oeb/base.py                |  6 ++--
 src/calibre/ebooks/oeb/parse_utils.py         | 28 +++++++------------
 .../ebooks/oeb/polish/check/parsing.py        |  6 ++--
 src/calibre/ebooks/oeb/polish/container.py    |  8 +++---
 src/calibre/ebooks/oeb/polish/parsing.py      |  6 ++--
 src/calibre/ebooks/oeb/reader.py              | 14 ++++------
 src/calibre/ebooks/oeb/transforms/cover.py    |  4 +--
 src/calibre/ebooks/pdf/pdftohtml.py           |  4 +--
 src/calibre/ebooks/pdf/reflow.py              |  4 +--
 src/calibre/ebooks/pml/pmlml.py               |  3 +-
 src/calibre/ebooks/rtf/rtfml.py               |  3 +-
 src/calibre/ebooks/snb/snbml.py               |  3 +-
 src/calibre/ebooks/txt/txtml.py               |  3 +-
 src/calibre/gui2/dialogs/opml.py              |  3 +-
 src/calibre/gui2/store/opensearch_store.py    |  5 ++--
 .../gui2/store/stores/gutenberg_plugin.py     |  6 ++--
 .../gui2/store/stores/litres_plugin.py        |  5 ++--
 .../gui2/store/stores/manybooks_plugin.py     |  6 ++--
 .../gui2/store/stores/xinxii_plugin.py        |  4 +--
 src/calibre/gui2/tweak_book/diff/view.py      |  3 +-
 .../library/catalogs/epub_mobi_builder.py     |  5 ++--
 src/calibre/spell/import_from.py              |  5 ++--
 src/calibre/srv/opds.py                       |  5 ++--
 src/calibre/utils/opensearch/description.py   |  5 ++--
 src/calibre/utils/xml_parse.py                | 19 +++++++++++++
 src/calibre/web/feeds/recipes/collection.py   |  5 ++--
 51 files changed, 166 insertions(+), 164 deletions(-)
 create mode 100644 src/calibre/utils/xml_parse.py

diff --git a/src/calibre/devices/prs505/sony_cache.py b/src/calibre/devices/prs505/sony_cache.py
index 5c0d1677c4..85a0002165 100644
--- a/src/calibre/devices/prs505/sony_cache.py
+++ b/src/calibre/devices/prs505/sony_cache.py
@@ -92,7 +92,7 @@ def uuid():
 class XMLCache(object):
 
     def __init__(self, paths, ext_paths, prefixes, use_author_sort):
-        from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
 
         if DEBUG:
             debug_print('Building XMLCache...', paths)
@@ -101,7 +101,6 @@ class XMLCache(object):
         self.use_author_sort = use_author_sort
 
         # Parse XML files {{{
-        parser = etree.XMLParser(recover=True)
         self.roots = {}
         for source_id, path in paths.items():
             if source_id == 0:
@@ -116,10 +115,9 @@ class XMLCache(object):
                     with lopen(path, 'rb') as f:
                         raw = f.read()
 
-            self.roots[source_id] = etree.fromstring(xml_to_unicode(
-                        raw, strip_encoding_pats=True, assume_utf8=True,
-                        verbose=DEBUG)[0],
-                        parser=parser)
+            self.roots[source_id] = safe_xml_fromstring(
+                xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
+            )
             if self.roots[source_id] is None:
                 raise Exception(('The SONY database at %r is corrupted. Try '
                         ' disconnecting and reconnecting your reader.')%path)
@@ -136,10 +134,9 @@ class XMLCache(object):
             if os.access(path, os.W_OK):
                 try:
                     with lopen(path, 'rb') as f:
-                        self.ext_roots[source_id] = etree.fromstring(
-                                xml_to_unicode(f.read(),
-                                    strip_encoding_pats=True, assume_utf8=True,
-                                    verbose=DEBUG)[0], parser=parser)
+                        self.ext_roots[source_id] = safe_xml_fromstring(
+                            xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
+                        )
                         self.ext_paths[source_id] = path
                 except:
                     pass
diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py
index 66d77c0e8e..c240ded56c 100644
--- a/src/calibre/ebooks/__init__.py
+++ b/src/calibre/ebooks/__init__.py
@@ -51,9 +51,9 @@ def return_raster_image(path):
 
 
 def extract_cover_from_embedded_svg(html, base, log):
-    from lxml import etree
     from calibre.ebooks.oeb.base import XPath, SVG, XLINK
-    root = etree.fromstring(html)
+    from calibre.utils.xml_parse import safe_xml_fromstring
+    root = safe_xml_fromstring(html)
 
     svg = XPath('//svg:svg')(root)
     if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py
index 48723dc5b5..7c60b0e606 100644
--- a/src/calibre/ebooks/conversion/plugins/epub_input.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_input.py
@@ -231,7 +231,7 @@ class EPUBInput(InputFormatPlugin):
         return removed
 
     def find_opf(self):
-        from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
 
         def attr(n, attr):
             for k, v in n.attrib.items():
@@ -239,7 +239,7 @@ class EPUBInput(InputFormatPlugin):
                     return v
         try:
             with lopen('META-INF/container.xml', 'rb') as f:
-                root = etree.fromstring(f.read())
+                root = safe_xml_fromstring(f.read())
                 for r in root.xpath('//*[local-name()="rootfile"]'):
                     if attr(r, 'media-type') != "application/oebps-package+xml":
                         continue
@@ -356,12 +356,13 @@ class EPUBInput(InputFormatPlugin):
         from calibre.ebooks.oeb.polish.parsing import parse
         from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
         from calibre.ebooks.oeb.polish.toc import first_child
+        from calibre.utils.xml_parse import safe_xml_fromstring
         from tempfile import NamedTemporaryFile
         with lopen(nav_path, 'rb') as f:
             raw = f.read()
         raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
         root = parse(raw, log=log)
-        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
+        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
         navmap = ncx[0]
         et = '{%s}type' % EPUB_NS
         bn = os.path.basename(nav_path)
diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py
index 6ac300b655..d82220d021 100644
--- a/src/calibre/ebooks/conversion/plugins/fb2_input.py
+++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py
@@ -39,10 +39,11 @@ class FB2Input(InputFormatPlugin):
     def convert(self, stream, options, file_ext, log,
                 accelerators):
         from lxml import etree
+        from calibre.utils.xml_parse import safe_xml_fromstring
         from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
         from calibre.ebooks.metadata.opf2 import OPFCreator
         from calibre.ebooks.metadata.meta import get_metadata
-        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
+        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
         from calibre.ebooks.chardet import xml_to_unicode
         self.log = log
         log.debug('Parsing XML...')
@@ -51,15 +52,9 @@ class FB2Input(InputFormatPlugin):
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
             assume_utf8=True, resolve_entities=True)[0]
         try:
-            doc = etree.fromstring(raw)
+            doc = safe_xml_fromstring(raw)
         except etree.XMLSyntaxError:
-            try:
-                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
-                if doc is None:
-                    raise Exception('parse failed')
-            except:
-                doc = etree.fromstring(raw.replace('& ', '&amp;'),
-                        parser=RECOVER_PARSER)
+            doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
         if doc is None:
             raise ValueError('The FB2 file is not valid XML')
         doc = ensure_namespace(doc)
@@ -99,7 +94,7 @@ class FB2Input(InputFormatPlugin):
             ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                     re.DOTALL).sub('', ss)
 
-        styledoc = etree.fromstring(ss)
+        styledoc = safe_xml_fromstring(ss)
 
         transform = etree.XSLT(styledoc)
         result = transform(doc)
diff --git a/src/calibre/ebooks/conversion/plugins/lit_input.py b/src/calibre/ebooks/conversion/plugins/lit_input.py
index aa8f0c2925..96f1867faf 100644
--- a/src/calibre/ebooks/conversion/plugins/lit_input.py
+++ b/src/calibre/ebooks/conversion/plugins/lit_input.py
@@ -43,7 +43,7 @@ class LITInput(InputFormatPlugin):
                     from calibre.ebooks.txt.processor import convert_basic, \
                         separate_paragraphs_single_line
                     from calibre.ebooks.chardet import xml_to_unicode
-                    from lxml import etree
+                    from calibre.utils.xml_parse import safe_xml_fromstring
                     import copy
                     self.log('LIT file with all text in singe <pre> tag detected')
                     html = separate_paragraphs_single_line(pre.text)
@@ -55,7 +55,7 @@ class LITInput(InputFormatPlugin):
                         # SmartyPants skips text inside <pre> tags
                         from calibre.ebooks.conversion.preprocess import smarten_punctuation
                         html = smarten_punctuation(html, self.log)
-                    root = etree.fromstring(html)
+                    root = safe_xml_fromstring(html)
                     body = XPath('//h:body')(root)
                     pre.tag = XHTML('div')
                     pre.text = ''
diff --git a/src/calibre/ebooks/conversion/plugins/lrf_input.py b/src/calibre/ebooks/conversion/plugins/lrf_input.py
index e69654540d..d40def2f1c 100644
--- a/src/calibre/ebooks/conversion/plugins/lrf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/lrf_input.py
@@ -20,25 +20,19 @@ class LRFInput(InputFormatPlugin):
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
-        from lxml import etree
         from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
                 Canvas, ImageBlock, RuledLine)
         self.log = log
         self.log('Generating XML')
         from calibre.ebooks.lrf.lrfparser import LRFDocument
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        from lxml import etree
         d = LRFDocument(stream)
         d.parse()
         xml = d.to_xml(write_files=True)
         if options.verbose > 2:
             open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
-        parser = etree.XMLParser(no_network=True, huge_tree=True)
-        try:
-            doc = etree.fromstring(xml, parser=parser)
-        except:
-            self.log.warn('Failed to parse XML. Trying to recover')
-            parser = etree.XMLParser(no_network=True, huge_tree=True,
-                    recover=True)
-            doc = etree.fromstring(xml, parser=parser)
+        doc = safe_xml_fromstring(xml)
 
         char_button_map = {}
         for x in doc.xpath('//CharButton[@refobj]'):
@@ -60,7 +54,7 @@ class LRFInput(InputFormatPlugin):
                     plot_map[ro] = imgstr[0].get('file')
 
         self.log('Converting XML to HTML...')
-        styledoc = etree.fromstring(P('templates/lrf.xsl', data=True))
+        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
         media_type = MediaType()
         styles = Styles()
         text_block = TextBlock(styles, char_button_map, plot_map, log)
diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py
index 6093c5a6c3..d18c18320b 100644
--- a/src/calibre/ebooks/conversion/plugins/rtf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py
@@ -251,6 +251,7 @@ class RTFInput(InputFormatPlugin):
         from calibre.ebooks.metadata.opf2 import OPFCreator
         from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
         from calibre.ebooks.rtf.input import InlineClass
+        from calibre.utils.xml_parse import safe_xml_fromstring
         self.opts = options
         self.log = log
         self.log('Converting RTF to XML...')
@@ -270,8 +271,7 @@ class RTFInput(InputFormatPlugin):
                 self.log.exception('Failed to extract images...')
 
         self.log('Parsing XML...')
-        parser = etree.XMLParser(recover=True, no_network=True)
-        doc = etree.fromstring(xml, parser=parser)
+        doc = safe_xml_fromstring(xml)
         border_styles = self.convert_borders(doc)
         for pict in doc.xpath('//rtf:pict[@num]',
                 namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
@@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):
 
         self.log('Converting XML to HTML...')
         inline_class = InlineClass(self.log)
-        styledoc = etree.fromstring(P('templates/rtf.xsl', data=True))
+        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
         extensions = {('calibre', 'inline-class') : inline_class}
         transform = etree.XSLT(styledoc, extensions=extensions)
         result = transform(doc)
diff --git a/src/calibre/ebooks/conversion/plugins/snb_input.py b/src/calibre/ebooks/conversion/plugins/snb_input.py
index 03d213b6e1..23ac302e1c 100644
--- a/src/calibre/ebooks/conversion/plugins/snb_input.py
+++ b/src/calibre/ebooks/conversion/plugins/snb_input.py
@@ -32,10 +32,10 @@ class SNBInput(InputFormatPlugin):
     def convert(self, stream, options, file_ext, log,
                 accelerators):
         import uuid
-        from lxml import etree
 
         from calibre.ebooks.oeb.base import DirContainer
         from calibre.ebooks.snb.snbfile import SNBFile
+        from calibre.utils.xml_parse import safe_xml_fromstring
 
         log.debug("Parsing SNB file...")
         snbFile = SNBFile()
@@ -52,7 +52,7 @@ class SNBInput(InputFormatPlugin):
                 encoding=options.input_encoding, populate=False)
         meta = snbFile.GetFileStream('snbf/book.snbf')
         if meta is not None:
-            meta = etree.fromstring(meta)
+            meta = safe_xml_fromstring(meta)
             l = {'title'    : './/head/name',
                   'creator'  : './/head/author',
                   'language' : './/head/language',
@@ -87,7 +87,7 @@ class SNBInput(InputFormatPlugin):
             toc = snbFile.GetFileStream('snbf/toc.snbf')
             oeb.container = DirContainer(tdir, log)
             if toc is not None:
-                toc = etree.fromstring(toc)
+                toc = safe_xml_fromstring(toc)
                 i = 1
                 for ch in toc.find('.//body'):
                     chapterName = ch.text
@@ -96,7 +96,7 @@ class SNBInput(InputFormatPlugin):
                     data = snbFile.GetFileStream('snbc/' + chapterSrc)
                     if data is None:
                         continue
-                    snbc = etree.fromstring(data)
+                    snbc = safe_xml_fromstring(data)
                     lines = []
                     for line in snbc.find('.//body'):
                         if line.tag == 'text':
diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py
index 8738ba5375..6dd89f3eea 100644
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@@ -18,11 +18,12 @@ from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
 
 
 def fromstring(raw, parser=RECOVER_PARSER):
-    return etree.fromstring(raw, parser=parser)
+    return safe_xml_fromstring(raw)
 
 # Read metadata {{{
 
diff --git a/src/calibre/ebooks/docx/dump.py b/src/calibre/ebooks/docx/dump.py
index 8e04395905..5852482876 100644
--- a/src/calibre/ebooks/docx/dump.py
+++ b/src/calibre/ebooks/docx/dump.py
@@ -11,6 +11,7 @@ from lxml import etree
 
 from calibre import walk
 from calibre.utils.zipfile import ZipFile
+from calibre.utils.xml_parse import safe_xml_fromstring
 
 
 def pretty_all_xml_in_dir(path):
@@ -19,7 +20,7 @@ def pretty_all_xml_in_dir(path):
             with open(f, 'r+b') as stream:
                 raw = stream.read()
                 if raw:
-                    root = etree.fromstring(raw)
+                    root = safe_xml_fromstring(raw)
                     stream.seek(0)
                     stream.truncate()
                     stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 71573bac8b..d8a0098fb2 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -17,6 +17,7 @@ from lxml import etree
 from calibre import prepare_string_for_xml
 from calibre.constants import __appname__, __version__
 from calibre.utils.localization import lang_as_iso639_1
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.img import save_cover_data_to
 from calibre.ebooks.oeb.base import urlnormalize
 from polyglot.builtins import unicode_type, string_or_bytes, range, filter
@@ -69,7 +70,7 @@ class FB2MLizer(object):
         output = self.clean_text('\n'.join(output))
 
         if self.opts.pretty_print:
-            output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
+            output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True)
 
         return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
 
diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py
index 09f404acb8..7abd0ce9ad 100644
--- a/src/calibre/ebooks/metadata/docx.py
+++ b/src/calibre/ebooks/metadata/docx.py
@@ -8,9 +8,8 @@ __docformat__ = 'restructuredtext en'
 
 from io import BytesIO
 
-from lxml import etree
-
 from calibre.ebooks.docx.container import DOCX
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
 from calibre.utils.imghdr import identify
 
@@ -61,11 +60,11 @@ def set_metadata(stream, mi):
         ap_raw = c.read(ap_name)
     except Exception:
         ap_raw = None
-    cp = etree.fromstring(dp_raw)
+    cp = safe_xml_fromstring(dp_raw)
     update_doc_props(cp, mi, c.namespace)
     replacements = {}
     if ap_raw is not None:
-        ap = etree.fromstring(ap_raw)
+        ap = safe_xml_fromstring(ap_raw)
         comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
         for child in tuple(ap):
             if child.tag == comp.tag:
diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py
index 3ebac847f5..b554db6bf7 100644
--- a/src/calibre/ebooks/metadata/epub.py
+++ b/src/calibre/ebooks/metadata/epub.py
@@ -12,13 +12,12 @@ import os
 import posixpath
 from contextlib import closing
 
-from lxml import etree
-
 from calibre import CurrentDir
 from calibre.ebooks.metadata.opf import (
     get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
 )
 from calibre.ebooks.metadata.opf2 import OPF
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.localunzip import LocalZipFile
 from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
@@ -42,7 +41,7 @@ class Container(dict):
     def __init__(self, stream=None):
         if not stream:
             return
-        container = etree.fromstring(stream.read())
+        container = safe_xml_fromstring(stream.read())
         if container.get('version', None) != '1.0':
             raise EPubException("unsupported version of OCF")
         rootfiles = container.xpath('./*[local-name()="rootfiles"]')
@@ -70,8 +69,7 @@ class Encryption(object):
             'http://www.idpf.org/2008/embedding'])
 
     def __init__(self, raw):
-        from lxml import etree
-        self.root = etree.fromstring(raw) if raw else None
+        self.root = safe_xml_fromstring(raw) if raw else None
         self.entries = {}
         if self.root is not None:
             for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py
index 13b39ce2e6..ceaf0479a1 100644
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@@ -15,6 +15,7 @@ from lxml import etree
 
 from calibre.utils.date import parse_only_date
 from calibre.utils.img import save_cover_data_to
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.imghdr import identify
 from calibre import guess_type, guess_all_extensions, prints, force_unicode
 from calibre.ebooks.metadata import MetaInformation, check_isbn
@@ -315,9 +316,8 @@ def _parse_language(root, mi, ctx):
 
 
 def _get_fbroot(raw):
-    parser = etree.XMLParser(recover=True, no_network=True)
     raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
-    root = etree.fromstring(raw, parser=parser)
+    root = safe_xml_fromstring(raw)
     return ensure_namespace(root)
 
 
@@ -452,5 +452,5 @@ def ensure_namespace(doc):
         import re
         raw = etree.tostring(doc, encoding='unicode')
         raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
-        doc = etree.fromstring(raw)
+        doc = safe_xml_fromstring(raw)
     return doc
diff --git a/src/calibre/ebooks/metadata/lrx.py b/src/calibre/ebooks/metadata/lrx.py
index 678d05f294..5dc4b742d7 100644
--- a/src/calibre/ebooks/metadata/lrx.py
+++ b/src/calibre/ebooks/metadata/lrx.py
@@ -11,9 +11,9 @@ Read metadata from LRX files
 
 import struct
 from zlib import decompress
-from lxml import etree
 
 from calibre.ebooks.metadata import MetaInformation, string_to_authors
+from calibre.utils.xml_parse import safe_xml_fromstring
 
 
 def _read(f, at, amount):
@@ -66,7 +66,7 @@ def get_metadata(f):
         info = decompress(f.read(compressed_size))
         if len(info) != uncompressed_size:
             raise ValueError('LRX file has malformed metadata section')
-        root = etree.fromstring(info)
+        root = safe_xml_fromstring(info)
         bi = root.find('BookInfo')
         title = bi.find('Title')
         title_sort = title.get('reading', None)
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index f8df51bed4..9aeee85ed7 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -23,6 +23,7 @@ from calibre.utils.localization import get_lang, canonicalize_lang
 from calibre import prints, guess_type
 from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.config import tweaks
+from calibre.utils.xml_parse import safe_xml_fromstring
 from polyglot.builtins import iteritems, unicode_type, getcwd, map
 from polyglot.urllib import unquote, urlparse
 
@@ -1588,7 +1589,7 @@ def metadata_to_opf(mi, as_string=True, default_lang=None):
                 is None else default_lang)
         mi.languages = [lang]
 
-    root = etree.fromstring(textwrap.dedent(
+    root = safe_xml_fromstring(textwrap.dedent(
     '''
     <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0">
         <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
diff --git a/src/calibre/ebooks/metadata/opf3_test.py b/src/calibre/ebooks/metadata/opf3_test.py
index 0fd67d912b..8b98c1235e 100644
--- a/src/calibre/ebooks/metadata/opf3_test.py
+++ b/src/calibre/ebooks/metadata/opf3_test.py
@@ -7,9 +7,8 @@ from collections import defaultdict
 from io import BytesIO
 import unittest
 
-from lxml import etree
-
 from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.metadata.opf3 import (
     parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers,
@@ -37,7 +36,7 @@ class TestOPF3(unittest.TestCase):
     ae = unittest.TestCase.assertEqual
 
     def get_opf(self, metadata='', manifest=''):
-        return etree.fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest))
+        return safe_xml_fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest))
 
     def test_prefix_parsing(self):  # {{{
         self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'),
@@ -523,7 +522,7 @@ class TestOPF3(unittest.TestCase):
                     self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))
 
         mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
-        root = etree.fromstring(raw)
+        root = safe_xml_fromstring(raw)
         root.set('version', '3.0')
         mi3, _, raster_cover, first_spine_item  = read_metadata(root, return_extra_data=True)
         self.assertIsNone(raster_cover)
diff --git a/src/calibre/ebooks/metadata/snb.py b/src/calibre/ebooks/metadata/snb.py
index 12bc843a0f..fc2aa47510 100644
--- a/src/calibre/ebooks/metadata/snb.py
+++ b/src/calibre/ebooks/metadata/snb.py
@@ -9,7 +9,7 @@ import os
 import io
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.snb.snbfile import SNBFile
-from lxml import etree
+from calibre.utils.xml_parse import safe_xml_fromstring
 
 
 def get_metadata(stream, extract_cover=True):
@@ -27,7 +27,7 @@ def get_metadata(stream, extract_cover=True):
         meta = snbFile.GetFileStream('snbf/book.snbf')
 
         if meta is not None:
-            meta = etree.fromstring(meta)
+            meta = safe_xml_fromstring(meta)
             mi.title = meta.find('.//head/name').text
             mi.authors = [meta.find('.//head/author').text]
             mi.language = meta.find('.//head/language').text.lower().replace('_', '-')
diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py
index 4cc2d30c85..bb044f7cca 100644
--- a/src/calibre/ebooks/metadata/sources/douban.py
+++ b/src/calibre/ebooks/metadata/sources/douban.py
@@ -49,7 +49,7 @@ class Douban(Source):
 
     name = 'Douban Books'
     author = 'Li Fanxi'
-    version = (2, 1, 1)
+    version = (2, 1, 2)
     minimum_calibre_version = (2, 80, 0)
 
     description = _('Downloads metadata and covers from Douban.com. '
@@ -119,8 +119,10 @@ class Douban(Source):
         try:
             log.info(id_url)
             raw = get_details(browser, id_url, timeout)
-            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-                strip_encoding_pats=True)[0])
+            feed = etree.fromstring(
+                xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
+                parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
+            )
             extra = entry(feed)[0]
         except:
             log.exception('Failed to get additional details for', mi.title)
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index 7853c6153a..7e19add00f 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -105,7 +105,8 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
     try:
         raw = get_details(browser, id_url, timeout)
         feed = etree.fromstring(
-            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]
+            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
+            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
         )
         extra = entry(feed)[0]
     except:
@@ -173,7 +174,7 @@ def to_metadata(browser, log, entry_, timeout):  # {{{
 class GoogleBooks(Source):
 
     name = 'Google'
-    version = (1, 0, 0)
+    version = (1, 0, 1)
     minimum_calibre_version = (2, 80, 0)
     description = _('Downloads metadata and covers from Google Books')
 
@@ -371,10 +372,9 @@ class GoogleBooks(Source):
             return as_unicode(e)
 
         try:
-            parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(
                 xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
-                parser=parser
+                parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
             )
             entries = entry(feed)
         except Exception as e:
diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py
index 1e1968be2a..3803120e4e 100644
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -12,6 +12,7 @@ from lxml.builder import ElementMaker
 
 from calibre.constants import __appname__, __version__
 from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.cleantext import clean_xml_chars
 from polyglot.builtins import unicode_type, getcwd
 from polyglot.urllib import unquote, urlparse
@@ -177,8 +178,7 @@ class TOC(list):
             with open(toc, 'rb') as f:
                 raw  = xml_to_unicode(f.read(), assume_utf8=True,
                         strip_encoding_pats=True)[0]
-            root = etree.fromstring(raw, parser=etree.XMLParser(recover=True,
-                no_network=True))
+            root = safe_xml_fromstring(raw)
         xpn = {'re': 'http://exslt.org/regular-expressions'}
         XPath = functools.partial(etree.XPath, namespaces=xpn)
 
diff --git a/src/calibre/ebooks/metadata/utils.py b/src/calibre/ebooks/metadata/utils.py
index 7d3a1eaabf..1eb32792ec 100644
--- a/src/calibre/ebooks/metadata/utils.py
+++ b/src/calibre/ebooks/metadata/utils.py
@@ -6,17 +6,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 from collections import namedtuple
 from polyglot.builtins import map
 
-from lxml import etree
 
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.base import OPF
 from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.spell import parse_lang_code
 from calibre.utils.localization import lang_as_iso639_1
+from calibre.utils.xml_parse import safe_xml_fromstring
 from polyglot.builtins import filter
 
-PARSER = etree.XMLParser(recover=True, no_network=True)
-
 OPFVersion = namedtuple('OPFVersion', 'major minor patch')
 
 
@@ -45,7 +43,7 @@ def parse_opf(stream_or_path):
         raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
     raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
     raw = raw[raw.find('<'):]
-    root = etree.fromstring(raw, PARSER)
+    root = safe_xml_fromstring(raw)
     if root is None:
         raise ValueError('Not an OPF file')
     return root
diff --git a/src/calibre/ebooks/metadata/xmp.py b/src/calibre/ebooks/metadata/xmp.py
index 47bbc409ec..2cc1a5d271 100644
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -14,6 +14,7 @@ from lxml.builder import ElementMaker
 
 from calibre import prints
 from calibre.ebooks.metadata import check_isbn, check_doi
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.opf2 import dump_dict
 from calibre.utils.date import parse_date, isoformat, now
@@ -74,9 +75,9 @@ def parse_xmp_packet(raw_bytes):
             enc = emap.get(m.group(1), enc)
             break
     if enc is None:
-        return etree.fromstring(raw_bytes)
+        return safe_xml_fromstring(raw_bytes)
     raw = _xml_declaration.sub('', raw_bytes.decode(enc))  # lxml barfs if encoding declaration present in unicode string
-    return etree.fromstring(raw)
+    return safe_xml_fromstring(raw)
 
 
 def serialize_xmp_packet(root, encoding='utf-8'):
diff --git a/src/calibre/ebooks/mobi/writer8/toc.py b/src/calibre/ebooks/mobi/writer8/toc.py
index d367226d3a..81246cf586 100644
--- a/src/calibre/ebooks/mobi/writer8/toc.py
+++ b/src/calibre/ebooks/mobi/writer8/toc.py
@@ -6,8 +6,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from lxml import etree
-
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.oeb.base import (urlnormalize, XPath, XHTML_NS, XHTML,
         XHTML_MIME, css_text)
 
@@ -88,7 +87,7 @@ class TOCAdder(object):
                     'body { font-family: %s }'%s.body_font_family]
             embed_css = '\n\n'.join(css)
 
-        root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
+        root = safe_xml_fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
             title=self.title, embed_css=embed_css,
             extra_css=(opts.extra_css or '')))
         parent = XPath('//h:ul')(root)[0]
diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py
index 6428c3270d..00121cfcb3 100644
--- a/src/calibre/ebooks/odt/input.py
+++ b/src/calibre/ebooks/odt/input.py
@@ -19,6 +19,7 @@ from odf.namespaces import TEXTNS as odTEXTNS
 
 from calibre import CurrentDir, walk
 from calibre.ebooks.oeb.base import _css_logger
+from calibre.utils.xml_parse import safe_xml_fromstring
 from polyglot.builtins import unicode_type, string_or_bytes, filter, getcwd, as_bytes
 
 
@@ -45,7 +46,7 @@ class Extract(ODF2XHTML):
                 ol.set('start', val)
 
     def fix_markup(self, html, log):
-        root = etree.fromstring(html)
+        root = safe_xml_fromstring(html)
         self.filter_css(root, log)
         self.extract_css(root, log)
         self.epubify_markup(root, log)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 0588257e67..e415d0a84b 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -16,11 +16,11 @@ from lxml import etree, html
 from calibre import force_unicode
 from calibre.constants import filesystem_encoding, __version__, ispy3
 from calibre.translations.dynamic import translate
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.conversion.preprocess import CSSPreProcessor
 from calibre import (isbytestring, as_unicode, get_types_map)
-from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
-        namespace, XHTML, parse_html, NotHTML)
+from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
@@ -946,7 +946,7 @@ class Manifest(object):
                 return
             data = xml_to_unicode(data, strip_encoding_pats=True,
                     assume_utf8=True, resolve_entities=True)[0]
-            return etree.fromstring(data, parser=RECOVER_PARSER)
+            return safe_xml_fromstring(data)
 
         def _parse_xhtml(self, data):
             orig_data = data
diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py
index 1f9e7c841b..224c950678 100644
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -11,6 +11,7 @@ import re
 from lxml import etree, html
 
 from calibre import xml_replace_entities, force_unicode
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.constants import filesystem_encoding
 from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
 from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
@@ -114,12 +115,7 @@ def _html4_parse(data):
             elem.text = elem.text.strip('-')
     data = etree.tostring(data, encoding='unicode')
 
-    # Setting huge_tree=True causes crashes in windows with large files
-    parser = etree.XMLParser(no_network=True)
-    try:
-        data = etree.fromstring(data, parser=parser)
-    except etree.XMLSyntaxError:
-        data = etree.fromstring(data, parser=RECOVER_PARSER)
+    data = safe_xml_fromstring(data)
     return data
 
 
@@ -210,19 +206,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
     data = data.replace('\0', '')
     data = raw = clean_word_doc(data, log)
 
-    # Setting huge_tree=True causes crashes in windows with large files
-    parser = etree.XMLParser(no_network=True)
-
     # Try with more & more drastic measures to parse
     try:
-        data = etree.fromstring(data, parser=parser)
+        data = safe_xml_fromstring(data)
         check_for_html5(pre, data)
     except (HTML5Doc, etree.XMLSyntaxError):
         log.debug('Initial parse failed, using more'
                 ' forgiving parsers')
         raw = data = xml_replace_entities(raw)
         try:
-            data = etree.fromstring(data, parser=parser)
+            data = safe_xml_fromstring(data)
             check_for_html5(pre, data)
         except (HTML5Doc, etree.XMLSyntaxError):
             log.debug('Parsing %s as HTML' % filename)
@@ -251,7 +244,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
         if barename(data.tag) in non_html_file_tags:
             raise NotHTML(data.tag)
         log.warn('File %r does not appear to be (X)HTML'%filename)
-        nroot = etree.fromstring('<html></html>')
+        nroot = safe_xml_fromstring('<html></html>')
         has_body = False
         for child in list(data):
             if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
@@ -260,7 +253,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
         parent = nroot
         if not has_body:
             log.warn('File %r appears to be a HTML fragment'%filename)
-            nroot = etree.fromstring('<html><body/></html>')
+            nroot = safe_xml_fromstring('<html><body/></html>')
             parent = nroot[0]
         for child in list(data.iter()):
             oparent = child.getparent()
@@ -276,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
         data = etree.tostring(data, encoding='unicode')
 
         try:
-            data = etree.fromstring(data, parser=parser)
+            data = safe_xml_fromstring(data)
         except:
             data = data.replace(':=', '=').replace(':>', '>')
             data = data.replace('<http:/>', '')
             try:
-                data = etree.fromstring(data, parser=parser)
+                data = safe_xml_fromstring(data)
             except etree.XMLSyntaxError:
                 log.warn('Stripping comments from %s'%
                         filename)
@@ -292,12 +285,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
                     '')
                 data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                 try:
-                    data = etree.fromstring(data,
-                            parser=RECOVER_PARSER)
+                    data = safe_xml_fromstring(data)
                 except etree.XMLSyntaxError:
                     log.warn('Stripping meta tags from %s'% filename)
                     data = re.sub(r'<meta\s+[^>]+?>', '', data)
-                    data = etree.fromstring(data, parser=RECOVER_PARSER)
+                    data = safe_xml_fromstring(data)
     elif namespace(data.tag) != XHTML_NS:
         # OEB_DOC_NS, but possibly others
         ns = namespace(data.tag)
diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py
index 34c24372a5..66c91418dd 100644
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@@ -7,11 +7,12 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import re
 
-from lxml.etree import XMLParser, fromstring, XMLSyntaxError
+from lxml.etree import XMLSyntaxError
 import css_parser
 
 from calibre import force_unicode, human_readable, prepare_string_for_xml
 from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
 from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
@@ -276,7 +277,6 @@ def check_xml_parsing(name, mt, raw):
     # Get rid of entities as named entities trip up the XML parser
     eproc = EntitityProcessor(mt)
     eraw = entity_pat.sub(eproc, raw)
-    parser = XMLParser(recover=False)
     errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
     errors = []
     if eproc.ok_named_entities:
@@ -288,7 +288,7 @@ def check_xml_parsing(name, mt, raw):
             errors.append(BadEntity(ent, name, lnum, col))
 
     try:
-        root = fromstring(eraw, parser=parser)
+        root = safe_xml_fromstring(eraw, recover=False)
     except UnicodeDecodeError:
         return errors + [DecodeError(name)]
     except XMLSyntaxError as err:
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 1cadb8986c..aba74c297b 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -18,7 +18,6 @@ from io import BytesIO
 from itertools import count
 
 from css_parser import getUrls, replaceUrls
-from lxml import etree
 
 from calibre import CurrentDir, walk
 from calibre.constants import iswindows
@@ -42,7 +41,7 @@ from calibre.ebooks.oeb.base import (
     DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks,
     rewrite_links, serialize, urlquote, urlunquote
 )
-from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER, NotHTML, parse_html
+from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html
 from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
 from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
 from calibre.ebooks.oeb.polish.utils import (
@@ -52,6 +51,7 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF
 from calibre.utils.filenames import hardlink_file, nlinks_file
 from calibre.utils.ipc.simple_worker import WorkerError, fork_job
 from calibre.utils.logging import default_log
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.zipfile import ZipFile
 from polyglot.builtins import iteritems, map, unicode_type, zip
 from polyglot.urllib import urlparse
@@ -201,7 +201,7 @@ class ContainerBase(object):  # {{{
         data, self.used_encoding = xml_to_unicode(
             data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
         data = unicodedata.normalize('NFC', data)
-        return etree.fromstring(data, parser=RECOVER_PARSER)
+        return safe_xml_fromstring(data)
 
     def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
         if self.tweak_mode:
@@ -1178,7 +1178,7 @@ class EpubContainer(Container):
         container_path = join(self.root, 'META-INF', 'container.xml')
         if not exists(container_path):
             raise InvalidEpub('No META-INF/container.xml in epub')
-        container = etree.fromstring(open(container_path, 'rb').read())
+        container = safe_xml_fromstring(open(container_path, 'rb').read())
         opf_files = container.xpath((
             r'child::ocf:rootfiles/ocf:rootfile'
             '[@media-type="%s" and @full-path]'%guess_type('a.opf')
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index 1b94dce1aa..cac8e307a3 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -7,10 +7,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import re
 
-from lxml.etree import XMLParser, fromstring, Element as LxmlElement
+from lxml.etree import Element as LxmlElement
 import html5_parser
 
 from calibre import xml_replace_entities
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
 from calibre.utils.cleantext import clean_xml_chars
 from polyglot.builtins import unicode_type
@@ -77,8 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
     if force_html5_parse:
         return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
     try:
-        parser = XMLParser(no_network=True)
-        ans = fromstring(raw, parser=parser)
+        ans = safe_xml_fromstring(raw)
         if ans.tag != '{%s}html' % XHTML_NS:
             raise ValueError('Root tag is not <html> in the XHTML namespace')
         if linenumber_attribute:
diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index bcdade1eec..2c248a8c1c 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -21,6 +21,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
                                     urlnormalize, BINARY_MIME, \
                                     OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.localization import get_lang
 from calibre.ptempfile import TemporaryDirectory
@@ -108,23 +109,18 @@ class OEBReader(object):
         data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                 OPF1_NS, data)
         try:
-            opf = etree.fromstring(data)
+            opf = safe_xml_fromstring(data)
         except etree.XMLSyntaxError:
             data = xml_replace_entities(clean_xml_chars(data), encoding=None)
             try:
-                opf = etree.fromstring(data)
+                opf = safe_xml_fromstring(data)
                 self.logger.warn('OPF contains invalid HTML named entities')
             except etree.XMLSyntaxError:
                 data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                 data = data.replace('<dc-metadata>',
                     '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
-                try:
-                    opf = etree.fromstring(data)
-                    self.logger.warn('OPF contains invalid tours section')
-                except etree.XMLSyntaxError:
-                    self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
-                    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
-                    opf = etree.fromstring(data, parser=RECOVER_PARSER)
+                opf = safe_xml_fromstring(data)
+                self.logger.warn('OPF contains invalid tours section')
 
         ns = namespace(opf.tag)
         if ns not in ('', OPF1_NS, OPF2_NS):
diff --git a/src/calibre/ebooks/oeb/transforms/cover.py b/src/calibre/ebooks/oeb/transforms/cover.py
index 8fbb7bcc24..4409f0cf83 100644
--- a/src/calibre/ebooks/oeb/transforms/cover.py
+++ b/src/calibre/ebooks/oeb/transforms/cover.py
@@ -8,9 +8,9 @@ __docformat__ = 'restructuredtext en'
 
 import textwrap
 
-from lxml import etree
 from calibre import guess_type
 from calibre.utils.imghdr import identify
+from calibre.utils.xml_parse import safe_xml_fromstring
 from polyglot.builtins import unicode_type
 from polyglot.urllib import unquote
 
@@ -156,7 +156,7 @@ class CoverManager(object):
                 tp = templ%unquote(href)
                 id, href = m.generate('titlepage', 'titlepage.xhtml')
                 item = m.add(id, href, guess_type('t.xhtml')[0],
-                        data=etree.fromstring(tp))
+                        data=safe_xml_fromstring(tp))
         else:
             item = self.oeb.manifest.hrefs[
                     urldefrag(self.oeb.guide['titlepage'].href)[0]]
diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 57dce5b42b..9df116bd6f 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -129,9 +129,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
 
 def parse_outline(raw, output_dir):
     from lxml import etree
-    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
+    from calibre.utils.xml_parse import safe_xml_fromstring
     raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
-    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
+    outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
     if outline:
         from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
         outline = outline[0]
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 780a1e2556..8987672322 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -12,6 +12,7 @@ from itertools import count
 from lxml import etree
 
 from polyglot.builtins import range, map
+from calibre.utils.xml_parse import safe_xml_fromstring
 
 
 class Font(object):
@@ -622,8 +623,7 @@ class PDFDocument(object):
 
     def __init__(self, xml, opts, log):
         self.opts, self.log = opts, log
-        parser = etree.XMLParser(recover=True)
-        self.root = etree.fromstring(xml, parser=parser)
+        self.root = safe_xml_fromstring(xml)
         idc = count()
 
         self.fonts = []
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 06af92cd83..6474656b94 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -14,6 +14,7 @@ import re
 from lxml import etree
 
 from calibre.ebooks.pdb.ereader import image_name
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.pml import unipmlcode
 from polyglot.builtins import unicode_type, string_or_bytes
 
@@ -138,7 +139,7 @@ class PMLMLizer(object):
             self.log.debug('Converting %s to PML markup...' % item.href)
             content = etree.tostring(item.data, encoding='unicode')
             content = self.prepare_text(content)
-            content = etree.fromstring(content)
+            content = safe_xml_fromstring(content)
             stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
             text.append(self.add_page_anchor(item))
             text += self.dump_text(content.find(XHTML('body')), stylizer, item)
diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py
index d3814d8228..9098038f32 100644
--- a/src/calibre/ebooks/rtf/rtfml.py
+++ b/src/calibre/ebooks/rtf/rtfml.py
@@ -109,6 +109,7 @@ class RTFMLizer(object):
     def mlize_spine(self):
         from calibre.ebooks.oeb.base import XHTML
         from calibre.ebooks.oeb.stylizer import Stylizer
+        from calibre.utils.xml_parse import safe_xml_fromstring
         output = self.header()
         if 'titlepage' in self.oeb_book.guide:
             href = self.oeb_book.guide['titlepage'].href
@@ -126,7 +127,7 @@ class RTFMLizer(object):
             content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
             content = self.remove_newlines(content)
             content = self.remove_tabs(content)
-            content = etree.fromstring(content)
+            content = safe_xml_fromstring(content)
             stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
             self.currently_dumping_item = item
             output += self.dump_text(content.find(XHTML('body')), stylizer)
diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py
index c93d3d3701..4936587ae5 100644
--- a/src/calibre/ebooks/snb/snbml.py
+++ b/src/calibre/ebooks/snb/snbml.py
@@ -84,6 +84,7 @@ class SNBMLizer(object):
     def mlize(self):
         from calibre.ebooks.oeb.base import XHTML
         from calibre.ebooks.oeb.stylizer import Stylizer
+        from calibre.utils.xml_parse import safe_xml_fromstring
         output = [u'']
         stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
         content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')
@@ -98,7 +99,7 @@ class SNBMLizer(object):
             etree.SubElement(snbcTree, "body")
             trees[subitem] = snbcTree
         output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
-        output += self.dump_text(self.subitems, etree.fromstring(content), stylizer)[0]
+        output += self.dump_text(self.subitems, safe_xml_fromstring(content), stylizer)[0]
         output = self.cleanup_text(''.join(output))
 
         subitem = ''
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index f2abd4a0d5..14cc7d8c5f 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -67,6 +67,7 @@ class TXTMLizer(object):
     def mlize_spine(self):
         from calibre.ebooks.oeb.base import XHTML
         from calibre.ebooks.oeb.stylizer import Stylizer
+        from calibre.utils.xml_parse import safe_xml_fromstring
         output = [u'']
         output.append(self.get_toc())
         for item in self.oeb_book.spine:
@@ -76,7 +77,7 @@ class TXTMLizer(object):
                     x.text = x.text.replace('--', '__')
             content = etree.tostring(item.data, encoding='unicode')
             content = self.remove_newlines(content)
-            content = etree.fromstring(content)
+            content = safe_xml_fromstring(content)
             stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
             output += self.dump_text(content.find(XHTML('body')), stylizer, item)
             output += '\n\n\n\n\n\n'
diff --git a/src/calibre/gui2/dialogs/opml.py b/src/calibre/gui2/dialogs/opml.py
index b571d1dbe5..245d252162 100644
--- a/src/calibre/gui2/dialogs/opml.py
+++ b/src/calibre/gui2/dialogs/opml.py
@@ -15,6 +15,7 @@ from PyQt5.Qt import (
 from lxml import etree
 
 from calibre.gui2 import choose_files, error_dialog
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.icu import sort_key
 from polyglot.builtins import unicode_type
 
@@ -32,7 +33,7 @@ def uniq(vals, kmap=lambda x:x):
 
 
 def import_opml(raw, preserve_groups=True):
-    root = etree.fromstring(raw)
+    root = safe_xml_fromstring(raw)
     groups = defaultdict(list)
     ax = etree.XPath('ancestor::outline[@title or @text]')
     for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):
diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py
index 5acf124455..d51e36514e 100644
--- a/src/calibre/gui2/store/opensearch_store.py
+++ b/src/calibre/gui2/store/opensearch_store.py
@@ -8,12 +8,11 @@ __docformat__ = 'restructuredtext en'
 
 from contextlib import closing
 
-from lxml import etree
-
 from PyQt5.Qt import QUrl
 
 from calibre import (browser, guess_extension)
 from calibre.gui2 import open_url
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult
 from calibre.gui2.store.web_store_dialog import WebStoreDialog
@@ -36,7 +35,7 @@ def open_search(url, query, max_results=10, timeout=60):
     counter = max_results
     br = browser()
     with closing(br.open(url, timeout=timeout)) as f:
-        doc = etree.fromstring(f.read())
+        doc = safe_xml_fromstring(f.read())
         for data in doc.xpath('//*[local-name() = "entry"]'):
             if counter <= 0:
                 break
diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py
index 288329b7e5..62852eeb79 100644
--- a/src/calibre/gui2/store/stores/gutenberg_plugin.py
+++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-store_version = 5  # Needed for dynamic plugin loading
+store_version = 6  # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
 __copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
@@ -43,7 +43,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
         if write_raw_to is not None:
             with open(write_raw_to, 'wb') as f:
                 f.write(raw)
-        doc = etree.fromstring(raw)
+        doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
         for data in doc.xpath('//*[local-name() = "entry"]'):
             if counter <= 0:
                 break
@@ -63,7 +63,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
 
             # Get the formats and direct download links.
             with closing(br.open(id, timeout=timeout/4)) as nf:
-                ndoc = etree.fromstring(nf.read())
+                ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
                 for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                     type = link.get('type')
                     href = link.get('href')
diff --git a/src/calibre/gui2/store/stores/litres_plugin.py b/src/calibre/gui2/store/stores/litres_plugin.py
index 150473b049..128cbbcf2d 100644
--- a/src/calibre/gui2/store/stores/litres_plugin.py
+++ b/src/calibre/gui2/store/stores/litres_plugin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-store_version = 1  # Needed for dynamic plugin loading
+store_version = 2  # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
 __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
@@ -63,8 +63,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
             ungzipResponse(r,br)
             raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
 
-            parser = etree.XMLParser(recover=True, no_network=True)
-            doc = etree.fromstring(raw, parser=parser)
+            doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
             for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                 if counter <= 0:
                     break
diff --git a/src/calibre/gui2/store/stores/manybooks_plugin.py b/src/calibre/gui2/store/stores/manybooks_plugin.py
index 5087673310..ec174949d5 100644
--- a/src/calibre/gui2/store/stores/manybooks_plugin.py
+++ b/src/calibre/gui2/store/stores/manybooks_plugin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-store_version = 1  # Needed for dynamic plugin loading
+store_version = 2  # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@@ -46,7 +46,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
     with closing(br.open(url, timeout=timeout)) as f:
         raw_data = f.read()
         raw_data = raw_data.decode('utf-8', 'replace')
-        doc = etree.fromstring(raw_data)
+        doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
         for data in doc.xpath('//*[local-name() = "entry"]'):
             if counter <= 0:
                 break
@@ -71,7 +71,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
 
             # Follow the detail link to get the rest of the info.
             with closing(br.open(detail_href, timeout=timeout/4)) as df:
-                ddoc = etree.fromstring(df.read())
+                ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
                 ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                 if ddata:
                     ddata = ddata[0]
diff --git a/src/calibre/gui2/store/stores/xinxii_plugin.py b/src/calibre/gui2/store/stores/xinxii_plugin.py
index ae7ff386d5..bb2324f4ab 100644
--- a/src/calibre/gui2/store/stores/xinxii_plugin.py
+++ b/src/calibre/gui2/store/stores/xinxii_plugin.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-store_version = 1  # Needed for dynamic plugin loading
+store_version = 2  # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@@ -47,7 +47,7 @@ class XinXiiStore(BasicStoreConfig, OpenSearchOPDSStore):
         counter = max_results
         br = browser()
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = etree.fromstring(f.read())
+            doc = etree.fromstring(f.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
             for data in doc.xpath('//*[local-name() = "entry"]'):
                 if counter <= 0:
                     break
diff --git a/src/calibre/gui2/tweak_book/diff/view.py b/src/calibre/gui2/tweak_book/diff/view.py
index 7fb2389cc4..3963666322 100644
--- a/src/calibre/gui2/tweak_book/diff/view.py
+++ b/src/calibre/gui2/tweak_book/diff/view.py
@@ -28,6 +28,7 @@ from calibre.gui2.tweak_book.editor.text import PlainTextEdit, default_font_fami
 from calibre.gui2.tweak_book.editor.themes import theme_color, get_theme
 from calibre.gui2.tweak_book.diff import get_sequence_matcher
 from calibre.gui2.tweak_book.diff.highlight import get_highlighter
+from calibre.utils.xml_parse import safe_xml_fromstring
 
 Change = namedtuple('Change', 'ltop lbot rtop rbot kind')
 
@@ -47,7 +48,7 @@ def beautify_text(raw, syntax):
     from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
     from calibre.ebooks.chardet import strip_encoding_declarations
     if syntax == 'xml':
-        root = etree.fromstring(strip_encoding_declarations(raw))
+        root = safe_xml_fromstring(strip_encoding_declarations(raw))
         pretty_xml_tree(root)
     elif syntax == 'css':
         import logging
diff --git a/src/calibre/library/catalogs/epub_mobi_builder.py b/src/calibre/library/catalogs/epub_mobi_builder.py
index eb12fe9b2d..2ffe46a6bf 100644
--- a/src/calibre/library/catalogs/epub_mobi_builder.py
+++ b/src/calibre/library/catalogs/epub_mobi_builder.py
@@ -21,6 +21,7 @@ from calibre import (
     replace_entities, strftime, xml_replace_entities
 )
 from calibre.constants import cache_dir, isosx
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.customize.conversion import DummyReporter
 from calibre.customize.ui import output_profiles
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
@@ -2992,7 +2993,7 @@ class CatalogBuilder(object):
             <navMap/>
             </ncx>
         '''
-        root = self.ncx_root = etree.fromstring(header)
+        root = self.ncx_root = safe_xml_fromstring(header)
         navMapTag = root[0]
 
         if self.generate_for_kindle_mobi:
@@ -3668,7 +3669,7 @@ class CatalogBuilder(object):
                 lang=prepare_string_for_xml(lang),
                 pt="periodical:default" if self.generate_for_kindle_mobi else ""
         )
-        root = etree.fromstring(header)
+        root = safe_xml_fromstring(header)
         manifest = root.xpath('//*[local-name()="manifest"]')[0]
         spine = root.xpath('//*[local-name()="spine"]')[0]
         guide = root.xpath('//*[local-name()="guide"]')[0]
diff --git a/src/calibre/spell/import_from.py b/src/calibre/spell/import_from.py
index f189335108..b2c54d43ea 100644
--- a/src/calibre/spell/import_from.py
+++ b/src/calibre/spell/import_from.py
@@ -10,6 +10,7 @@ import sys, glob, os, tempfile, re, codecs
 from lxml import etree
 
 from calibre.constants import config_dir
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.zipfile import ZipFile
 from polyglot.builtins import iteritems
 
@@ -26,7 +27,7 @@ BUILTIN_LOCALES = {'en-US', 'en-GB', 'es-ES'}
 def parse_xcu(raw, origin='%origin%'):
     ' Get the dictionary and affix file names as well as supported locales for each dictionary '
     ans = {}
-    root = etree.fromstring(raw)
+    root = safe_xml_fromstring(raw)
 
     for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root):
         value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
@@ -123,7 +124,7 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
                     key = key[3:]
                 return zf.open(key.lstrip('/')).read()
 
-        root = etree.fromstring(zf.open('META-INF/manifest.xml').read())
+        root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
         xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get(
             '{%s}full-path' % NS_MAP['manifest'])
         for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):
diff --git a/src/calibre/srv/opds.py b/src/calibre/srv/opds.py
index 59f3e75612..24f38d8c75 100644
--- a/src/calibre/srv/opds.py
+++ b/src/calibre/srv/opds.py
@@ -15,6 +15,7 @@ from lxml.builder import ElementMaker
 
 from calibre.constants import __appname__
 from calibre.db.view import sanitize_sort_field_name
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars
 from calibre.library.comments import comments_to_html
 from calibre import guess_type, prepare_string_for_xml as xml
@@ -123,7 +124,7 @@ def html_to_lxml(raw):
     root.set('xmlns', "http://www.w3.org/1999/xhtml")
     raw = etree.tostring(root, encoding=None)
     try:
-        return etree.fromstring(raw)
+        return safe_xml_fromstring(raw)
     except:
         for x in root.iterdescendants():
             remove = []
@@ -134,7 +135,7 @@ def html_to_lxml(raw):
                 del x.attrib[a]
         raw = etree.tostring(root, encoding=None)
         try:
-            return etree.fromstring(raw)
+            return safe_xml_fromstring(raw)
         except:
             from calibre.ebooks.oeb.parse_utils import _html4_parse
             return _html4_parse(raw)
diff --git a/src/calibre/utils/opensearch/description.py b/src/calibre/utils/opensearch/description.py
index f931448054..c5f90671ab 100644
--- a/src/calibre/utils/opensearch/description.py
+++ b/src/calibre/utils/opensearch/description.py
@@ -11,9 +11,8 @@ __docformat__ = 'restructuredtext en'
 
 from contextlib import closing
 
-from lxml import etree
-
 from calibre import browser
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.utils.opensearch.url import URL
 
 
@@ -38,7 +37,7 @@ class Description(object):
         '''
         br = browser()
         with closing(br.open(url, timeout=15)) as f:
-            doc = etree.fromstring(f.read())
+            doc = safe_xml_fromstring(f.read())
 
         # version 1.1 has repeating Url elements.
         self.urls = []
diff --git a/src/calibre/utils/xml_parse.py b/src/calibre/utils/xml_parse.py
new file mode 100644
index 0000000000..a82d4bc773
--- /dev/null
+++ b/src/calibre/utils/xml_parse.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+from lxml import etree
+
+# Module-level parsers used by safe_xml_fromstring. resolve_entities is turned
+# off because entities can cause reads of local files, for example:
+# <!DOCTYPE foo [ <!ENTITY passwd SYSTEM "file:///etc/passwd" >]>
+SAFE_XML_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)  # lenient variant: recover=True
+SAFE_XML_PARSER_NO_RECOVER = etree.XMLParser(recover=False, no_network=True, resolve_entities=False)  # strict variant: recover=False
+fs = etree.fromstring  # short alias for the call made in safe_xml_fromstring
+
+
+def safe_xml_fromstring(string_or_bytes, recover=True):  # parse XML with entity resolution and network access disabled
+    return fs(string_or_bytes, SAFE_XML_PARSER if recover else SAFE_XML_PARSER_NO_RECOVER)  # recover selects the lenient or the strict parser
diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py
index c6dbef2065..5c81fae1c1 100644
--- a/src/calibre/web/feeds/recipes/collection.py
+++ b/src/calibre/web/feeds/recipes/collection.py
@@ -14,6 +14,7 @@ from lxml import etree
 from lxml.builder import ElementMaker
 
 from calibre import force_unicode
+from calibre.utils.xml_parse import safe_xml_fromstring
 from calibre.constants import numeric_version
 from calibre.utils.iso8601 import parse_iso8601
 from calibre.utils.date import now as nowf, utcnow, local_tz, isoformat, EPOCH, UNDEFINED_DATE
@@ -124,7 +125,7 @@ def get_custom_recipe_collection(*args):
             import traceback
             traceback.print_exc()
             continue
-    return etree.fromstring(serialize_collection(rmap))
+    return safe_xml_fromstring(serialize_collection(rmap))
 
 
 def update_custom_recipe(id_, title, script):
@@ -287,7 +288,7 @@ class SchedulerConfig(object):
         if os.access(self.conf_path, os.R_OK):
             with ExclusiveFile(self.conf_path) as f:
                 try:
-                    self.root = etree.fromstring(f.read())
+                    self.root = safe_xml_fromstring(f.read())
                 except:
                     print('Failed to read recipe scheduler config')
                     import traceback