diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 7ee9f5131f..1bb13d21a2 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -91,7 +91,11 @@ def html5_parse(data, max_nesting_depth=100): with warnings.catch_warnings(): warnings.simplefilter('ignore') - data = html5lib.parse(data, treebuilder='lxml').getroot() + try: + data = html5lib.parse(data, treebuilder='lxml').getroot() + except ValueError: + from calibre.utils.cleantext import clean_xml_chars + data = html5lib.parse(clean_xml_chars(data), treebuilder='lxml').getroot() # Check that the asinine HTML 5 algorithm did not result in a tree with # insane nesting depths diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py index ce5d18b494..2bc2dff96f 100644 --- a/src/calibre/ebooks/oeb/polish/tests/parsing.py +++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py @@ -94,8 +94,10 @@ def space_characters(test, parse_function): root = parse_function(markup) err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root) test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err) - markup = '

\u000b\u000c

' + markup = '

a\u000b\u000c

' root = parse_function(markup) # Should strip non XML safe control code \u000b + test.assertNotIn('\u000b', root.xpath('//*[local-name()="p"]')[0].text, err) + test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err) def case_insensitive_element_names(test, parse_function): markup = '

'