Better workaround for XXE in lxml

Instead of disabling resolution of all entities, only disable resolution
of entities from files
This commit is contained in:
Kovid Goyal 2020-01-08 22:02:23 +05:30
parent f686647286
commit f0e6ead4c7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 50 additions and 14 deletions

View File

@ -135,6 +135,8 @@ def find_tests(which_tests=None):
a(find_tests()) a(find_tests())
from calibre.utils.hyphenation.test_hyphenation import find_tests from calibre.utils.hyphenation.test_hyphenation import find_tests
a(find_tests()) a(find_tests())
from calibre.utils.xml_parse import find_tests
a(find_tests())
if iswindows: if iswindows:
from calibre.utils.windows.wintest import find_tests from calibre.utils.windows.wintest import find_tests
a(find_tests()) a(find_tests())

View File

@ -5,27 +5,61 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
from lxml import etree from lxml import etree
import threading
# resolve_entities is turned off as entities can cause # resolving of SYSTEM entitties is turned off as entities can cause
# reads of local files, for example: # reads of local files, for example:
# <!DOCTYPE foo [ <!ENTITY passwd SYSTEM "file:///etc/passwd" >]> # <!DOCTYPE foo [ <!ENTITY passwd SYSTEM "file:///etc/passwd" >]>
_global_tls = threading.local()
fs = etree.fromstring fs = etree.fromstring
def parser(recover): class Resolver(etree.Resolver):
parsers = getattr(_global_tls, 'parsers', None)
if parsers is None: def resolve(self, url, id, context):
_global_tls.parsers = parsers = { return self.resolve_string('', context)
True:
etree.XMLParser(recover=True, no_network=True, resolve_entities=False),
False: def create_parser(recover):
etree.XMLParser(recover=False, no_network=True, resolve_entities=False) parser = etree.XMLParser(recover=recover, no_network=True)
} parser.resolvers.add(Resolver())
return parsers[recover] return parser
def safe_xml_fromstring(string_or_bytes, recover=True): def safe_xml_fromstring(string_or_bytes, recover=True):
return fs(string_or_bytes, parser(recover)) return fs(string_or_bytes, parser=create_parser(recover))
def find_tests():
import unittest, tempfile, os
class TestXMLParse(unittest.TestCase):
def setUp(self):
with tempfile.NamedTemporaryFile(delete=False) as tf:
tf.write(b'external')
self.temp_file = tf.name
def tearDown(self):
os.remove(self.temp_file)
def test_safe_xml_fromstring(self):
templ = '''<!DOCTYPE foo [ <!ENTITY e {id} "{val}" > ]><r>&e;</r>'''
external = 'file:///' + self.temp_file
self.assertEqual(etree.fromstring(templ.format(id='SYSTEM', val=external)).text, 'external')
for eid, val, expected in (
('', 'normal entity', 'normal entity'),
('SYSTEM', external, None),
('SYSTEM', 'http://example.com', None),
('', external, external),
('PUBLIC', external, None),
('PUBLIC', 'http://example.com', None),
):
got = safe_xml_fromstring(templ.format(id=eid, val=val)).text
self.assertEqual(got, expected)
return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse)
if __name__ == '__main__':
from calibre.utils.run_tests import run_tests
run_tests(find_tests)