mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Test framework for tag soup parsing
Also fix a couple of minor bugs in the current HTML 5 parser (self-closed <style/>, <script/>, <textarea/> tags and re-mapped namespace prefixes).
This commit is contained in:
parent
18d5d9a3a4
commit
b4bf871077
@ -88,7 +88,7 @@ self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b
|
||||
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
|
||||
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
|
||||
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
|
||||
'video'}
|
||||
'video', 'title', 'script', 'style'}
|
||||
|
||||
_self_closing_pat = re.compile(
|
||||
r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(self_closing_bad_tags)),
|
||||
|
@ -83,7 +83,7 @@ def node_depth(node):
|
||||
def html5_parse(data, max_nesting_depth=100):
|
||||
import html5lib
|
||||
# html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
|
||||
data = re.sub(r'<\s*title\s*[^>]*/\s*>', '<title></title>', data)
|
||||
data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
|
||||
|
||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||
|
||||
@ -116,6 +116,7 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
prefix = x[11:]
|
||||
namespaces[prefix] = val
|
||||
|
||||
remapped_namespaces = {}
|
||||
if namespaces:
|
||||
# Some destroyed namespace declarations were found
|
||||
p = elem.getparent()
|
||||
@ -127,6 +128,7 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
p.remove(elem)
|
||||
elem = clone_element(elem, nsmap=namespaces)
|
||||
p.insert(idx, elem)
|
||||
remapped_namespaces = {ns:namespaces[ns] for ns in set(namespaces) - set(elem.nsmap)}
|
||||
|
||||
b = barename(elem.tag)
|
||||
idx = b.find('U0003A')
|
||||
@ -135,6 +137,8 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
ns = elem.nsmap.get(prefix, None)
|
||||
if ns is None:
|
||||
ns = non_html5_namespaces.get(prefix, None)
|
||||
if ns is None:
|
||||
ns = remapped_namespaces.get(prefix, None)
|
||||
if ns is not None:
|
||||
elem.tag = '{%s}%s'%(ns, tag)
|
||||
|
||||
@ -145,6 +149,8 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
ns = elem.nsmap.get(prefix, None)
|
||||
if ns is None:
|
||||
ns = non_html5_namespaces.get(prefix, None)
|
||||
if ns is None:
|
||||
ns = remapped_namespaces.get(prefix, None)
|
||||
if ns is not None:
|
||||
elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)
|
||||
|
||||
|
80
src/calibre/ebooks/oeb/polish/tests/parsing.py
Normal file
80
src/calibre/ebooks/oeb/polish/tests/parsing.py
Normal file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
|
||||
from calibre.ebooks.oeb.parse_utils import html5_parse
|
||||
|
||||
def nonvoid_cdata_elements(test, parse_function):
    ''' If self closed version of non-void cdata elements like <title/> are
    present, the HTML5 parsing algorithm treats all following data as CDATA

    :param test: The running test case; only its ``assertEqual`` method is used.
    :param parse_function: Callable that is given a markup string and whose
        return value is queried with XPath and serialized with lxml.
    '''
    markup = '''
    <html> <head><{0}/></head> <body id="test"> </html>
    '''
    for tag in ('title', 'style', 'script', 'textarea'):
        # Try each tag as-is, upper-cased, preceded by a newline and followed
        # by an attribute.
        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
            root = parse_function(markup.format(x))
            # Serialize to text rather than bytes, so that building the
            # failure message cannot itself fail with a str/bytes TypeError.
            serialized = etree.tostring(root, encoding='unicode')
            test.assertEqual(
                len(XPath('//h:body[@id="test"]')(root)), 1,
                'Incorrect parsing for <%s/>, parsed markup:\n' % x + serialized)
|
||||
|
||||
def namespaces(test, parse_function):
    ''' Check that namespace declarations and prefixes in the markup end up
    as correctly namespaced elements and attributes in the parsed tree.

    :param test: The running test case; only its ``assertEqual`` method is used.
    :param parse_function: Callable that is given a markup string and whose
        return value is queried with XPath and serialized with lxml.
    '''
    ae = test.assertEqual

    def dump(root):
        # Serialize to text rather than bytes, so that building the failure
        # messages cannot itself fail with a str/bytes TypeError.
        return etree.tostring(root, encoding='unicode')

    # Default XHTML namespace declared on <html>
    markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
    root = parse_function(markup)
    ae(
        len(XPath('//h:body[@id="test"]')(root)), 1,
        'Incorrect parsing, parsed markup:\n' + dump(root))

    # Prefixed SVG elements with an xlink attribute
    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + dump(root)
    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # SVG as the default namespace of the <svg> element
    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + dump(root)
    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # No namespace declarations at all in the markup
    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
    root = parse_function(markup)
    err = 'Namespaces not created, parsed markup:\n' + dump(root)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # Arbitrary prefixes, including the ns1 prefix being re-declared with a
    # different URI on a child element
    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
    root = parse_function(markup)
    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + dump(root)

    def xpath(expr):
        # The prefixes here are bound to URIs independently of the prefixes
        # used in the markup above.
        return etree.XPath(expr, namespaces={'ns1':'NS', 'ns2':'NS2'})(root)
    ae(len(xpath('//ns1:tag1')), 1, err)
    ae(len(xpath('//ns1:tag2')), 1, err)
    ae(len(xpath('//ns2:tag3')), 1, err)
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
||||
|
||||
# Every check function listed here is called as check(test_case, parse_function)
all_checks = (nonvoid_cdata_elements, namespaces)
|
||||
|
||||
class ParsingTests(BaseTest):

    def test_conversion_parser(self):
        ' Test parsing with the parser used for conversion '
        # Run every registered check against html5_parse
        for check in all_checks:
            check(self, html5_parse)
|
Loading…
x
Reference in New Issue
Block a user