mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Fix html5 conversion parser not handling ascii control chars
This commit is contained in:
parent
62d042d9d4
commit
d3352aeec9
@ -91,7 +91,11 @@ def html5_parse(data, max_nesting_depth=100):
|
|||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter('ignore')
|
warnings.simplefilter('ignore')
|
||||||
|
try:
|
||||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||||
|
except ValueError:
|
||||||
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
data = html5lib.parse(clean_xml_chars(data), treebuilder='lxml').getroot()
|
||||||
|
|
||||||
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
||||||
# insane nesting depths
|
# insane nesting depths
|
||||||
|
@ -94,8 +94,10 @@ def space_characters(test, parse_function):
|
|||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
||||||
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
markup = '<html><p>\u000b\u000c</p>'
|
markup = '<html><p>a\u000b\u000c</p>'
|
||||||
root = parse_function(markup) # Should strip non XML safe control code \u000b
|
root = parse_function(markup) # Should strip non XML safe control code \u000b
|
||||||
|
test.assertNotIn('\u000b', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
|
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
|
|
||||||
def case_insensitive_element_names(test, parse_function):
|
def case_insensitive_element_names(test, parse_function):
|
||||||
markup = '<HTML><P> </p>'
|
markup = '<HTML><P> </p>'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user