mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Basic parsing with the new html5lib lxml tree builder works
This commit is contained in:
parent
6d08762344
commit
62d042d9d4
@ -80,11 +80,14 @@ def node_depth(node):
|
|||||||
p = p.getparent()
|
p = p.getparent()
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def fix_self_closing_cdata_tags(data):
|
||||||
|
from html5lib.constants import cdataElements, rcdataElements
|
||||||
|
return re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
|
||||||
|
|
||||||
def html5_parse(data, max_nesting_depth=100):
|
def html5_parse(data, max_nesting_depth=100):
|
||||||
import html5lib, warnings
|
import html5lib, warnings
|
||||||
from html5lib.constants import cdataElements, rcdataElements
|
|
||||||
# HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
|
# HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
|
||||||
data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
|
data = fix_self_closing_cdata_tags(data)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter('ignore')
|
warnings.simplefilter('ignore')
|
||||||
|
@ -7,29 +7,38 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
||||||
|
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import namespaces, tableInsertModeElements
|
||||||
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
|
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
|
||||||
from html5lib.ihatexml import InfosetFilter
|
from html5lib.ihatexml import InfosetFilter
|
||||||
|
from html5lib.html5parser import HTMLParser
|
||||||
|
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
||||||
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
|
||||||
infoset_filter = InfosetFilter()
|
infoset_filter = InfosetFilter()
|
||||||
coerce_comment = infoset_filter.coerceComment
|
to_xml_name = infoset_filter.toXmlName
|
||||||
coerce_text = infoset_filter.coerceCharacters
|
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
|
||||||
|
|
||||||
def create_lxml_context():
|
def create_lxml_context():
|
||||||
parser = XMLParser()
|
parser = XMLParser(no_network=True)
|
||||||
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def ElementFactory(name, namespace=None, context=None):
|
def ElementFactory(name, namespace=None, context=None):
|
||||||
context = context or create_lxml_context()
|
context = context or create_lxml_context()
|
||||||
ns = namespace or namespaces['html']
|
ns = namespace or namespaces['html']
|
||||||
return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns})
|
try:
|
||||||
|
return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns})
|
||||||
|
except ValueError:
|
||||||
|
return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns})
|
||||||
|
|
||||||
def CommentFactory(text):
|
def CommentFactory(text):
|
||||||
return Comment(coerce_comment(text))
|
return Comment(text.replace('--', '- -'))
|
||||||
|
|
||||||
class Element(ElementBase):
|
class Element(ElementBase):
|
||||||
|
|
||||||
@ -59,15 +68,13 @@ class Element(ElementBase):
|
|||||||
def namespace(self):
|
def namespace(self):
|
||||||
return self.nsmap[self.prefix]
|
return self.nsmap[self.prefix]
|
||||||
|
|
||||||
@dynamic_property
|
@property
|
||||||
|
def nameTuple(self):
|
||||||
|
return self.nsmap[self.prefix], self.tag.rpartition('}')[2]
|
||||||
|
|
||||||
|
@property
|
||||||
def attributes(self):
|
def attributes(self):
|
||||||
def fget(self):
|
return self.attrib
|
||||||
return self.attrib
|
|
||||||
def fset(self, val):
|
|
||||||
attrs = {('{%s}%s' % k) if isinstance(k, tuple) else k : v for k, v in val.iteritems()}
|
|
||||||
self.attrib.clear()
|
|
||||||
self.attrib.update(attrs)
|
|
||||||
return property(fget=fget, fset=fset)
|
|
||||||
|
|
||||||
@dynamic_property
|
@dynamic_property
|
||||||
def childNodes(self):
|
def childNodes(self):
|
||||||
@ -94,21 +101,30 @@ class Element(ElementBase):
|
|||||||
self.insert(self.index(ref_node), node)
|
self.insert(self.index(ref_node), node)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
data = coerce_text(data)
|
def append_text(el, attr):
|
||||||
|
try:
|
||||||
|
setattr(el, attr, (getattr(el, attr) or '') + data)
|
||||||
|
except ValueError:
|
||||||
|
text = data.replace('\u000c', ' ')
|
||||||
|
try:
|
||||||
|
setattr(el, attr, (getattr(el, attr) or '') + text)
|
||||||
|
except ValueError:
|
||||||
|
setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text))
|
||||||
|
|
||||||
if len(self) == 0:
|
if len(self) == 0:
|
||||||
self.text = (self.text or '') + data
|
append_text(self, 'text')
|
||||||
elif insertBefore is None:
|
elif insertBefore is None:
|
||||||
# Insert the text as the tail of the last child element
|
# Insert the text as the tail of the last child element
|
||||||
el = self[-1]
|
el = self[-1]
|
||||||
el.tail = (el.tail or '') + data
|
append_text(el, 'tail')
|
||||||
else:
|
else:
|
||||||
# Insert the text before the specified node
|
# Insert the text before the specified node
|
||||||
index = self.index(insertBefore)
|
index = self.index(insertBefore)
|
||||||
if index > 0:
|
if index > 0:
|
||||||
el = self[index - 1]
|
el = self[index - 1]
|
||||||
el.tail = (el.tail or '') + data
|
append_text(el, 'tail')
|
||||||
else:
|
else:
|
||||||
self.text = (self.text or '') + data
|
append_text(self, 'text')
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
def reparentChildren(self, new_parent):
|
||||||
# Move self.text
|
# Move self.text
|
||||||
@ -129,7 +145,7 @@ class Comment(CommentBase):
|
|||||||
def fget(self):
|
def fget(self):
|
||||||
return self.text
|
return self.text
|
||||||
def fset(self, val):
|
def fset(self, val):
|
||||||
self.text = coerce_comment(val)
|
self.text = val.replace('--', '- -')
|
||||||
return property(fget=fget, fset=fset)
|
return property(fget=fget, fset=fset)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -144,6 +160,10 @@ class Comment(CommentBase):
|
|||||||
def namespace(self):
|
def namespace(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nameTuple(self):
|
||||||
|
return None, None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def childNodes(self):
|
def childNodes(self):
|
||||||
return []
|
return []
|
||||||
@ -164,7 +184,7 @@ class Comment(CommentBase):
|
|||||||
reparentChildren = no_op
|
reparentChildren = no_op
|
||||||
|
|
||||||
def insertText(self, text, insertBefore=None):
|
def insertText(self, text, insertBefore=None):
|
||||||
self.text = (self.text or '') + coerce_comment(text)
|
self.text = (self.text or '') + text.replace('--', '- -')
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
return copy.copy(self)
|
return copy.copy(self)
|
||||||
@ -187,6 +207,43 @@ class DocType(object):
|
|||||||
self.text = self.name = name
|
self.text = self.name = name
|
||||||
self.public_id, self.system_id = public_id, system_id
|
self.public_id, self.system_id = public_id, system_id
|
||||||
|
|
||||||
|
def process_attribs(attrs, nsmap):
|
||||||
|
attribs = {}
|
||||||
|
namespaced_attribs = {}
|
||||||
|
xmlns = namespaces['xmlns']
|
||||||
|
for k, v in attrs.iteritems():
|
||||||
|
if isinstance(k, tuple):
|
||||||
|
if k[2] == xmlns:
|
||||||
|
prefix, name, ns = k
|
||||||
|
if prefix is None:
|
||||||
|
nsmap[None] = v
|
||||||
|
else:
|
||||||
|
nsmap[name] = v
|
||||||
|
else:
|
||||||
|
attribs['{%s}%s' % (k[2], k[1])] = v
|
||||||
|
else:
|
||||||
|
if ':' in k:
|
||||||
|
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
|
||||||
|
prefix = k.partition(':')[2] or None
|
||||||
|
nsmap[prefix] = v
|
||||||
|
else:
|
||||||
|
namespaced_attribs[k] = v
|
||||||
|
else:
|
||||||
|
attribs[k] = v
|
||||||
|
|
||||||
|
for k, v in namespaced_attribs.iteritems():
|
||||||
|
prefix, name = k.partition(':')[0::2]
|
||||||
|
if prefix == 'xml':
|
||||||
|
if name == 'lang':
|
||||||
|
attribs['lang'] = attribs.get('lang', v)
|
||||||
|
continue
|
||||||
|
ns = nsmap.get(prefix, None)
|
||||||
|
if ns is not None:
|
||||||
|
name = '{%s}%s' % (ns, name)
|
||||||
|
attribs[name] =v
|
||||||
|
|
||||||
|
return attribs
|
||||||
|
|
||||||
class TreeBuilder(BaseTreeBuilder):
|
class TreeBuilder(BaseTreeBuilder):
|
||||||
|
|
||||||
elementClass = ElementFactory
|
elementClass = ElementFactory
|
||||||
@ -194,6 +251,101 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
documentClass = Document
|
documentClass = Document
|
||||||
doctypeClass = DocType
|
doctypeClass = DocType
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, namespaceHTMLElements=True):
|
||||||
BaseTreeBuilder.__init__(self, True)
|
BaseTreeBuilder.__init__(self, True)
|
||||||
|
self.lxml_context = create_lxml_context()
|
||||||
|
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.document.root
|
||||||
|
|
||||||
|
# The following methods are re-implementations from BaseTreeBuilder to
|
||||||
|
# handle namespaces properly.
|
||||||
|
|
||||||
|
def insertRoot(self, token):
|
||||||
|
element = self.createElement(token, nsmap={None:namespaces['html']})
|
||||||
|
self.openElements.append(element)
|
||||||
|
self.document.appendChild(element)
|
||||||
|
|
||||||
|
def createElement(self, token, nsmap=None):
|
||||||
|
"""Create an element but don't insert it anywhere"""
|
||||||
|
nsmap = nsmap or {}
|
||||||
|
attribs = process_attribs(token['data'], nsmap)
|
||||||
|
name = token["name"]
|
||||||
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
|
if ':' in name:
|
||||||
|
prefix, name = name.partition(':')[0::2]
|
||||||
|
namespace = nsmap.get(prefix, namespace)
|
||||||
|
try:
|
||||||
|
elem = self.lxml_context.makeelement('{%s}%s' % (namespace, name), attrib=attribs, nsmap=nsmap)
|
||||||
|
except ValueError:
|
||||||
|
attribs = {to_xml_name(k):v for k, v in attribs.iteritems()}
|
||||||
|
elem = self.lxml_context.makeelement('{%s}%s' % (namespace, to_xml_name(name)), attrib=attribs, nsmap=nsmap)
|
||||||
|
|
||||||
|
# Ensure that svg and mathml elements get nice namespace prefixes if
|
||||||
|
# the input document is HTML 5 with no namespace information
|
||||||
|
if elem.prefix is not None and elem.prefix.startswith('ns') and namespace not in set(nsmap.itervalues()) and namespace in known_namespaces:
|
||||||
|
prefix = known_namespaces[namespace]
|
||||||
|
if prefix not in nsmap:
|
||||||
|
nsmap[prefix] = namespace
|
||||||
|
elem = self.lxml_context.makeelement(elem.tag, attrib=elem.attrib, nsmap=nsmap)
|
||||||
|
return elem
|
||||||
|
|
||||||
|
def insertElementNormal(self, token):
|
||||||
|
parent = self.openElements[-1]
|
||||||
|
element = self.createElement(token, parent.nsmap)
|
||||||
|
parent.appendChild(element)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def insertElementTable(self, token):
|
||||||
|
"""Create an element and insert it into the tree"""
|
||||||
|
if self.openElements[-1].name not in tableInsertModeElements:
|
||||||
|
return self.insertElementNormal(token)
|
||||||
|
# We should be in the InTable mode. This means we want to do
|
||||||
|
# special magic element rearranging
|
||||||
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
|
element = self.createElement(token, nsmap=parent.nsmap)
|
||||||
|
if insertBefore is None:
|
||||||
|
parent.appendChild(element)
|
||||||
|
else:
|
||||||
|
parent.insertBefore(element, insertBefore)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def apply_html_attributes(self, attrs):
|
||||||
|
html = self.openElements[0]
|
||||||
|
if len(html) > 0:
|
||||||
|
raise ValueError('Cannot apply attributes to <html> after it has children')
|
||||||
|
nsmap = html.nsmap.copy()
|
||||||
|
attribs = process_attribs(attrs, nsmap)
|
||||||
|
for k, v in attribs.iteritems():
|
||||||
|
if k not in html.attrib:
|
||||||
|
try:
|
||||||
|
html.set(k, v)
|
||||||
|
except ValueError:
|
||||||
|
html.set(to_xml_name(k), v)
|
||||||
|
if nsmap != html.nsmap:
|
||||||
|
newroot = self.lxml_context.makeelement(html.tag, attrib=html.attrib, nsmap=nsmap)
|
||||||
|
self.openElements[0] = newroot
|
||||||
|
if self.document.root is html:
|
||||||
|
self.document.root = newroot
|
||||||
|
|
||||||
|
def parse(raw, decoder=None):
|
||||||
|
if isinstance(raw, bytes):
|
||||||
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
|
# TODO: Replace entities?
|
||||||
|
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
||||||
|
# TODO: ignore warnings
|
||||||
|
parser = HTMLParser(tree=TreeBuilder)
|
||||||
|
parser.parse(raw, parseMeta=False, useChardet=False)
|
||||||
|
root = parser.tree.getDocument()
|
||||||
|
return root
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from lxml import etree
|
||||||
|
root = parse('<html><p -moo><gah\u000c>')
|
||||||
|
print (etree.tostring(root))
|
||||||
|
print()
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@ from lxml import etree
|
|||||||
from html5lib.constants import cdataElements, rcdataElements
|
from html5lib.constants import cdataElements, rcdataElements
|
||||||
|
|
||||||
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
||||||
|
from calibre.ebooks.oeb.polish.parsing import parse
|
||||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
|
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
|
||||||
from calibre.ebooks.oeb.parse_utils import html5_parse
|
from calibre.ebooks.oeb.parse_utils import html5_parse
|
||||||
|
|
||||||
@ -28,11 +29,17 @@ def nonvoid_cdata_elements(test, parse_function):
|
|||||||
|
|
||||||
def namespaces(test, parse_function):
|
def namespaces(test, parse_function):
|
||||||
ae = test.assertEqual
|
ae = test.assertEqual
|
||||||
|
def match_and_prefix(root, xpath, prefix, err=''):
|
||||||
|
matches = XPath(xpath)(root)
|
||||||
|
ae(len(matches), 1, err)
|
||||||
|
ae(matches[0].prefix, prefix)
|
||||||
|
|
||||||
markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
|
markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
ae(
|
ae(
|
||||||
len(XPath('//h:body[@id="test"]')(root)), 1,
|
len(XPath('//h:body[@id="test"]')(root)), 1,
|
||||||
'Incorrect parsing, parsed markup:\n' + etree.tostring(root))
|
'Incorrect parsing, parsed markup:\n' + etree.tostring(root))
|
||||||
|
match_and_prefix(root, '//h:body[@id="test"]', None)
|
||||||
|
|
||||||
markup = '''
|
markup = '''
|
||||||
<html xmlns="{xhtml}"><head><body id="test">
|
<html xmlns="{xhtml}"><head><body id="test">
|
||||||
@ -40,9 +47,9 @@ def namespaces(test, parse_function):
|
|||||||
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
|
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
||||||
ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
|
match_and_prefix(root, '//h:body[@id="test"]', None, err)
|
||||||
ae(len(XPath('//svg:svg')(root)), 1, err)
|
match_and_prefix(root, '//svg:svg', 'svg', err)
|
||||||
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
|
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
||||||
|
|
||||||
markup = '''
|
markup = '''
|
||||||
<html xmlns="{xhtml}"><head><body id="test">
|
<html xmlns="{xhtml}"><head><body id="test">
|
||||||
@ -50,15 +57,15 @@ def namespaces(test, parse_function):
|
|||||||
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
|
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
||||||
ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
|
match_and_prefix(root, '//h:body[@id="test"]', None, err)
|
||||||
ae(len(XPath('//svg:svg')(root)), 1, err)
|
match_and_prefix(root, '//svg:svg', None if parse_function is parse else 'svg', err)
|
||||||
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
|
match_and_prefix(root, '//svg:image[@xl:href]', None if parse_function is parse else 'svg', err)
|
||||||
|
|
||||||
markup = '<html><body><svg><image xlink:href="xxx"></svg>'
|
markup = '<html><body><svg><image xlink:href="xxx"></svg>'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root)
|
err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root)
|
||||||
ae(len(XPath('//svg:svg')(root)), 1, err)
|
match_and_prefix(root, '//svg:svg', 'svg', err)
|
||||||
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
|
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
||||||
|
|
||||||
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
@ -70,6 +77,9 @@ def namespaces(test, parse_function):
|
|||||||
ae(len(xpath('//ns2:tag3')), 1, err)
|
ae(len(xpath('//ns2:tag3')), 1, err)
|
||||||
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
|
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
|
||||||
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
||||||
|
for tag in root.iter():
|
||||||
|
if 'NS' in tag.tag:
|
||||||
|
ae('ns1', tag.prefix)
|
||||||
|
|
||||||
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
|
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
@ -84,6 +94,8 @@ def space_characters(test, parse_function):
|
|||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
||||||
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
|
markup = '<html><p>\u000b\u000c</p>'
|
||||||
|
root = parse_function(markup) # Should strip non XML safe control code \u000b
|
||||||
|
|
||||||
def case_insensitive_element_names(test, parse_function):
|
def case_insensitive_element_names(test, parse_function):
|
||||||
markup = '<HTML><P> </p>'
|
markup = '<HTML><P> </p>'
|
||||||
@ -99,3 +111,8 @@ class ParsingTests(BaseTest):
|
|||||||
' Test parsing with the HTML5 parser used for conversion '
|
' Test parsing with the HTML5 parser used for conversion '
|
||||||
for test in basic_checks:
|
for test in basic_checks:
|
||||||
test(self, html5_parse)
|
test(self, html5_parse)
|
||||||
|
|
||||||
|
def test_polish_parser(self):
|
||||||
|
' Test parsing with the HTML5 parser used for polishing '
|
||||||
|
for test in basic_checks:
|
||||||
|
test(self, parse)
|
||||||
|
@ -155,8 +155,8 @@ class HTMLParser(object):
|
|||||||
new_token = token
|
new_token = token
|
||||||
while new_token is not None:
|
while new_token is not None:
|
||||||
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
|
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
|
||||||
currentNodeNamespace = currentNode.namespace if currentNode else None
|
currentNodeNamespace = currentNode.namespace if currentNode is not None else None
|
||||||
currentNodeName = currentNode.name if currentNode else None
|
currentNodeName = currentNode.name if currentNode is not None else None
|
||||||
|
|
||||||
type = new_token["type"]
|
type = new_token["type"]
|
||||||
|
|
||||||
@ -472,9 +472,7 @@ def getPhases(debug):
|
|||||||
self.parser.parseError("non-html-root")
|
self.parser.parseError("non-html-root")
|
||||||
# XXX Need a check here to see if the first start tag token emitted is
|
# XXX Need a check here to see if the first start tag token emitted is
|
||||||
# this token... If it's not, invoke self.parser.parseError().
|
# this token... If it's not, invoke self.parser.parseError().
|
||||||
for attr, value in token["data"].items():
|
self.tree.apply_html_attributes(token['data'])
|
||||||
if attr not in self.tree.openElements[0].attributes:
|
|
||||||
self.tree.openElements[0].attributes[attr] = value
|
|
||||||
self.parser.firstStartTag = False
|
self.parser.firstStartTag = False
|
||||||
|
|
||||||
def processEndTag(self, token):
|
def processEndTag(self, token):
|
||||||
|
@ -269,6 +269,11 @@ class TreeBuilder(object):
|
|||||||
element.attributes = token["data"]
|
element.attributes = token["data"]
|
||||||
return element
|
return element
|
||||||
|
|
||||||
|
def apply_html_attributes(self, attrs):
|
||||||
|
for attr, value in attrs.items():
|
||||||
|
if attr not in self.openElements[0].attributes:
|
||||||
|
self.openElements[0].attributes[attr] = value
|
||||||
|
|
||||||
def _getInsertFromTable(self):
|
def _getInsertFromTable(self):
|
||||||
return self._insertFromTable
|
return self._insertFromTable
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user