Basic parsing with the new html5lib lxml tree builder works

This commit is contained in:
Kovid Goyal 2013-10-25 14:18:46 +05:30
parent 6d08762344
commit 62d042d9d4
5 changed files with 212 additions and 37 deletions

View File

@ -80,11 +80,14 @@ def node_depth(node):
p = p.getparent() p = p.getparent()
return ans return ans
def fix_self_closing_cdata_tags(data):
    """Rewrite self-closed CDATA/RCDATA tags as explicit open/close pairs.

    Every ``<name .../>`` whose name is one of html5lib's ``cdataElements``
    or ``rcdataElements`` is replaced by ``<name ...></name>`` (matching is
    case-insensitive). This is the workaround for the HTML5 parsing behaviour
    tracked in html5lib issue 195.

    :param data: The markup to pre-process.
    :return: The markup with the affected tags expanded.
    """
    from html5lib.constants import cdataElements, rcdataElements
    names = '|'.join(sorted(cdataElements | rcdataElements))
    # The lookahead forces the tag name to end right after the matched word,
    # so that e.g. <titlebar/> is not mistaken for <title/>. The second group
    # captures the attributes so that they survive the rewrite instead of
    # being silently dropped.
    pattern = r'<\s*(%s)(?=[\s/])([^>]*?)\s*/\s*>' % names
    return re.sub(pattern, r'<\1\2></\1>', data, flags=re.I)
def html5_parse(data, max_nesting_depth=100): def html5_parse(data, max_nesting_depth=100):
import html5lib, warnings import html5lib, warnings
from html5lib.constants import cdataElements, rcdataElements
# HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195 # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I) data = fix_self_closing_cdata_tags(data)
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter('ignore') warnings.simplefilter('ignore')

View File

@ -7,29 +7,38 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy import copy
from functools import partial
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
from html5lib.constants import namespaces from html5lib.constants import namespaces, tableInsertModeElements
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
from html5lib.ihatexml import InfosetFilter from html5lib.ihatexml import InfosetFilter
from html5lib.html5parser import HTMLParser
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
from calibre.utils.cleantext import clean_xml_chars
infoset_filter = InfosetFilter() infoset_filter = InfosetFilter()
coerce_comment = infoset_filter.coerceComment to_xml_name = infoset_filter.toXmlName
coerce_text = infoset_filter.coerceCharacters known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
def create_lxml_context(): def create_lxml_context():
parser = XMLParser() parser = XMLParser(no_network=True)
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment)) parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
return parser return parser
def ElementFactory(name, namespace=None, context=None): def ElementFactory(name, namespace=None, context=None):
context = context or create_lxml_context() context = context or create_lxml_context()
ns = namespace or namespaces['html'] ns = namespace or namespaces['html']
try:
return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns}) return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns})
except ValueError:
return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns})
def CommentFactory(text): def CommentFactory(text):
return Comment(coerce_comment(text)) return Comment(text.replace('--', '- -'))
class Element(ElementBase): class Element(ElementBase):
@ -59,15 +68,13 @@ class Element(ElementBase):
def namespace(self): def namespace(self):
return self.nsmap[self.prefix] return self.nsmap[self.prefix]
@dynamic_property @property
def nameTuple(self):
return self.nsmap[self.prefix], self.tag.rpartition('}')[2]
@property
def attributes(self): def attributes(self):
def fget(self):
return self.attrib return self.attrib
def fset(self, val):
attrs = {('{%s}%s' % k) if isinstance(k, tuple) else k : v for k, v in val.iteritems()}
self.attrib.clear()
self.attrib.update(attrs)
return property(fget=fget, fset=fset)
@dynamic_property @dynamic_property
def childNodes(self): def childNodes(self):
@ -94,21 +101,30 @@ class Element(ElementBase):
self.insert(self.index(ref_node), node) self.insert(self.index(ref_node), node)
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
data = coerce_text(data) def append_text(el, attr):
try:
setattr(el, attr, (getattr(el, attr) or '') + data)
except ValueError:
text = data.replace('\u000c', ' ')
try:
setattr(el, attr, (getattr(el, attr) or '') + text)
except ValueError:
setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text))
if len(self) == 0: if len(self) == 0:
self.text = (self.text or '') + data append_text(self, 'text')
elif insertBefore is None: elif insertBefore is None:
# Insert the text as the tail of the last child element # Insert the text as the tail of the last child element
el = self[-1] el = self[-1]
el.tail = (el.tail or '') + data append_text(el, 'tail')
else: else:
# Insert the text before the specified node # Insert the text before the specified node
index = self.index(insertBefore) index = self.index(insertBefore)
if index > 0: if index > 0:
el = self[index - 1] el = self[index - 1]
el.tail = (el.tail or '') + data append_text(el, 'tail')
else: else:
self.text = (self.text or '') + data append_text(self, 'text')
def reparentChildren(self, new_parent): def reparentChildren(self, new_parent):
# Move self.text # Move self.text
@ -129,7 +145,7 @@ class Comment(CommentBase):
def fget(self): def fget(self):
return self.text return self.text
def fset(self, val): def fset(self, val):
self.text = coerce_comment(val) self.text = val.replace('--', '- -')
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@property @property
@ -144,6 +160,10 @@ class Comment(CommentBase):
def namespace(self): def namespace(self):
return None return None
@property
def nameTuple(self):
return None, None
@property @property
def childNodes(self): def childNodes(self):
return [] return []
@ -164,7 +184,7 @@ class Comment(CommentBase):
reparentChildren = no_op reparentChildren = no_op
def insertText(self, text, insertBefore=None): def insertText(self, text, insertBefore=None):
self.text = (self.text or '') + coerce_comment(text) self.text = (self.text or '') + text.replace('--', '- -')
def cloneNode(self): def cloneNode(self):
return copy.copy(self) return copy.copy(self)
@ -187,6 +207,43 @@ class DocType(object):
self.text = self.name = name self.text = self.name = name
self.public_id, self.system_id = public_id, system_id self.public_id, self.system_id = public_id, system_id
def process_attribs(attrs, nsmap):
    """Sort token attributes into element attributes and namespace declarations.

    Keys of *attrs* are either plain strings or ``(prefix, name, namespace)``
    tuples. Namespace declarations are recorded in *nsmap*, which is modified
    in place. All other attributes are returned.

    :param attrs: Mapping of attribute name (string or tuple) to value.
    :param nsmap: Mapping of prefix (``None`` for the default namespace) to
        namespace URI; updated with every declaration found in *attrs*.
    :return: dict of attributes keyed by plain name, or by ``{ns}name`` for
        attributes whose namespace is known.
    """
    xmlns = namespaces['xmlns']
    attribs = {}
    prefixed = {}
    for k, v in attrs.iteritems():
        if isinstance(k, tuple):
            if k[2] == xmlns:
                prefix, name, ns = k
                # A missing prefix means this is the default namespace
                nsmap[None if prefix is None else name] = v
            else:
                attribs['{%s}%s' % (k[2], k[1])] = v
        elif k == 'xmlns':
            # A bare xmlns="..." declares the default namespace. This used to
            # be tested only for names containing ':' and so was unreachable,
            # which turned the declaration into a literal attribute.
            nsmap[None] = v
        elif k.startswith('xmlns:'):
            nsmap[k.partition(':')[2] or None] = v
        elif ':' in k:
            # Resolved below, once all declarations on this tag are known
            prefixed[k] = v
        else:
            attribs[k] = v

    for k, v in prefixed.iteritems():
        prefix, name = k.partition(':')[0::2]
        if prefix == 'xml':
            # xml:lang becomes lang unless a plain lang attribute is already
            # present; any other xml: attribute is discarded
            if name == 'lang':
                attribs['lang'] = attribs.get('lang', v)
            continue
        ns = nsmap.get(prefix, None)
        if ns is not None:
            name = '{%s}%s' % (ns, name)
        # With an undeclared prefix only the local name is kept
        attribs[name] = v
    return attribs
class TreeBuilder(BaseTreeBuilder): class TreeBuilder(BaseTreeBuilder):
elementClass = ElementFactory elementClass = ElementFactory
@ -194,6 +251,101 @@ class TreeBuilder(BaseTreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocType doctypeClass = DocType
def __init__(self): def __init__(self, namespaceHTMLElements=True):
BaseTreeBuilder.__init__(self, True) BaseTreeBuilder.__init__(self, True)
self.lxml_context = create_lxml_context()
self.elementClass = partial(ElementFactory, context=self.lxml_context)
def getDocument(self):
    """Return the ``root`` attribute of this builder's document object."""
    document = self.document
    return document.root
# The following methods are re-implementations from BaseTreeBuilder to
# handle namespaces properly.
def insertRoot(self, token):
    """Create the root element for *token* and register it.

    The element is created with the HTML namespace as its default namespace,
    pushed onto the stack of open elements and appended to the document.
    """
    html_nsmap = {None: namespaces['html']}
    root = self.createElement(token, nsmap=html_nsmap)
    self.openElements.append(root)
    self.document.appendChild(root)
def createElement(self, token, nsmap=None):
    """Create an element but don't insert it anywhere

    :param token: Start tag token; its ``name``, ``data`` (attributes) and
        optional ``namespace`` entries are used.
    :param nsmap: Prefix to namespace mapping in effect for the new element.
        Namespace declarations found on the token are added to it.
    :return: The newly created element.
    """
    nsmap = nsmap or {}
    attribs = process_attribs(token['data'], nsmap)
    name = token["name"]
    namespace = token.get("namespace", self.defaultNamespace)
    if ':' in name:
        prefix, name = name.partition(':')[0::2]
        namespace = nsmap.get(prefix, namespace)

    def xml_safe(key):
        # Coerce only the local part of a {ns}name key. Coercing the whole
        # key would escape the braces and the URI, detaching the attribute
        # from its namespace.
        if key.startswith('{'):
            ns, brace, local = key.partition('}')
            if brace and local:
                return ns + brace + to_xml_name(local)
        return to_xml_name(key)

    makeelement = self.lxml_context.makeelement
    try:
        elem = makeelement('{%s}%s' % (namespace, name), attrib=attribs, nsmap=nsmap)
    except ValueError:
        # Some name was rejected, retry with coerced element and attribute names
        attribs = {xml_safe(k): v for k, v in attribs.iteritems()}
        elem = makeelement('{%s}%s' % (namespace, to_xml_name(name)), attrib=attribs, nsmap=nsmap)

    # Ensure that svg and mathml elements get nice namespace prefixes if
    # the input document is HTML 5 with no namespace information
    if elem.prefix is not None and elem.prefix.startswith('ns') and namespace not in set(nsmap.itervalues()) and namespace in known_namespaces:
        prefix = known_namespaces[namespace]
        if prefix not in nsmap:
            nsmap[prefix] = namespace
            elem = makeelement(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    return elem
def insertElementNormal(self, token):
    """Create an element for *token* as the last child of the current node.

    The new element is created with the current node's namespace map, is
    pushed onto the stack of open elements, and is returned.
    """
    current = self.openElements[-1]
    node = self.createElement(token, current.nsmap)
    current.appendChild(node)
    self.openElements.append(node)
    return node
def insertElementTable(self, token):
    """Create an element and insert it into the tree"""
    current = self.openElements[-1]
    if current.name not in tableInsertModeElements:
        return self.insertElementNormal(token)

    # We should be in the InTable mode. This means we want to do
    # special magic element rearranging
    target, sibling = self.getTableMisnestedNodePosition()
    node = self.createElement(token, nsmap=target.nsmap)
    if sibling is not None:
        target.insertBefore(node, sibling)
    else:
        target.appendChild(node)
    self.openElements.append(node)
    return node
def apply_html_attributes(self, attrs):
    """Merge the attributes of an <html> start tag into the root element.

    Attributes already present on the root are left untouched. If *attrs*
    declares namespaces the root does not have yet, the root is re-created
    with the extended namespace map.

    :param attrs: Attributes of the <html> start tag token.
    :raises ValueError: If new namespace declarations would have to be
        applied to a root element that already has children.
    """
    html = self.openElements[0]
    nsmap = html.nsmap.copy()
    attribs = process_attribs(attrs, nsmap)
    needs_new_root = nsmap != html.nsmap
    # Only re-creating the root is impossible once it has children. Plain
    # attributes from a stray <html> tag later in the document can still be
    # merged, so do not reject those.
    if needs_new_root and len(html) > 0:
        raise ValueError('Cannot apply attributes to <html> after it has children')

    def xml_safe(key):
        # Keep the namespace of {ns}name keys, coercing only the local name
        if key.startswith('{'):
            ns, brace, local = key.partition('}')
            if brace and local:
                return ns + brace + to_xml_name(local)
        return to_xml_name(key)

    for k, v in attribs.iteritems():
        if k not in html.attrib:
            try:
                html.set(k, v)
            except ValueError:
                html.set(xml_safe(k), v)

    if needs_new_root:
        newroot = self.lxml_context.makeelement(html.tag, attrib=html.attrib, nsmap=nsmap)
        self.openElements[0] = newroot
        if self.document.root is html:
            self.document.root = newroot
def parse(raw, decoder=None):
    """Parse HTML markup into an lxml tree using the html5lib based builder.

    :param raw: The markup, as bytes or text.
    :param decoder: Optional callable used to decode *raw* when it is bytes.
        When omitted, ``xml_to_unicode`` is used.
    :return: Whatever the tree builder's ``getDocument()`` returns for the
        parsed document.
    """
    import warnings
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    # TODO: Replace entities?
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    # Silence warnings raised while parsing, as html5_parse() does.
    # NOTE(review): presumably these are mostly html5lib's name-coercion
    # warnings -- confirm.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        parser = HTMLParser(tree=TreeBuilder)
        parser.parse(raw, parseMeta=False, useChardet=False)
    return parser.tree.getDocument()
if __name__ == '__main__':
    # Manual smoke test: parse a small malformed snippet and dump the tree
    from lxml import etree
    sample = '<html><p -moo><gah\u000c>'
    print(etree.tostring(parse(sample)))
    print()

View File

@ -10,6 +10,7 @@ from lxml import etree
from html5lib.constants import cdataElements, rcdataElements from html5lib.constants import cdataElements, rcdataElements
from calibre.ebooks.oeb.polish.tests.base import BaseTest from calibre.ebooks.oeb.polish.tests.base import BaseTest
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
from calibre.ebooks.oeb.parse_utils import html5_parse from calibre.ebooks.oeb.parse_utils import html5_parse
@ -28,11 +29,17 @@ def nonvoid_cdata_elements(test, parse_function):
def namespaces(test, parse_function): def namespaces(test, parse_function):
ae = test.assertEqual ae = test.assertEqual
def match_and_prefix(root, xpath, prefix, err=''):
matches = XPath(xpath)(root)
ae(len(matches), 1, err)
ae(matches[0].prefix, prefix)
markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS) markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
root = parse_function(markup) root = parse_function(markup)
ae( ae(
len(XPath('//h:body[@id="test"]')(root)), 1, len(XPath('//h:body[@id="test"]')(root)), 1,
'Incorrect parsing, parsed markup:\n' + etree.tostring(root)) 'Incorrect parsing, parsed markup:\n' + etree.tostring(root))
match_and_prefix(root, '//h:body[@id="test"]', None)
markup = ''' markup = '''
<html xmlns="{xhtml}"><head><body id="test"> <html xmlns="{xhtml}"><head><body id="test">
@ -40,9 +47,9 @@ def namespaces(test, parse_function):
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS) '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
root = parse_function(markup) root = parse_function(markup)
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root) err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
ae(len(XPath('//h:body[@id="test"]')(root)), 1, err) match_and_prefix(root, '//h:body[@id="test"]', None, err)
ae(len(XPath('//svg:svg')(root)), 1, err) match_and_prefix(root, '//svg:svg', 'svg', err)
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
markup = ''' markup = '''
<html xmlns="{xhtml}"><head><body id="test"> <html xmlns="{xhtml}"><head><body id="test">
@ -50,15 +57,15 @@ def namespaces(test, parse_function):
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS) '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
root = parse_function(markup) root = parse_function(markup)
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root) err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
ae(len(XPath('//h:body[@id="test"]')(root)), 1, err) match_and_prefix(root, '//h:body[@id="test"]', None, err)
ae(len(XPath('//svg:svg')(root)), 1, err) match_and_prefix(root, '//svg:svg', None if parse_function is parse else 'svg', err)
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) match_and_prefix(root, '//svg:image[@xl:href]', None if parse_function is parse else 'svg', err)
markup = '<html><body><svg><image xlink:href="xxx"></svg>' markup = '<html><body><svg><image xlink:href="xxx"></svg>'
root = parse_function(markup) root = parse_function(markup)
err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root) err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root)
ae(len(XPath('//svg:svg')(root)), 1, err) match_and_prefix(root, '//svg:svg', 'svg', err)
ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>' markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
root = parse_function(markup) root = parse_function(markup)
@ -70,6 +77,9 @@ def namespaces(test, parse_function):
ae(len(xpath('//ns2:tag3')), 1, err) ae(len(xpath('//ns2:tag3')), 1, err)
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err) ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err) ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
for tag in root.iter():
if 'NS' in tag.tag:
ae('ns1', tag.prefix)
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">' markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
root = parse_function(markup) root = parse_function(markup)
@ -84,6 +94,8 @@ def space_characters(test, parse_function):
root = parse_function(markup) root = parse_function(markup)
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root) err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err) test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
markup = '<html><p>\u000b\u000c</p>'
root = parse_function(markup) # Should strip non XML safe control code \u000b
def case_insensitive_element_names(test, parse_function): def case_insensitive_element_names(test, parse_function):
markup = '<HTML><P> </p>' markup = '<HTML><P> </p>'
@ -99,3 +111,8 @@ class ParsingTests(BaseTest):
' Test parsing with the HTML5 parser used for conversion ' ' Test parsing with the HTML5 parser used for conversion '
for test in basic_checks: for test in basic_checks:
test(self, html5_parse) test(self, html5_parse)
def test_polish_parser(self):
    ' Test parsing with the HTML5 parser used for polishing '
    # Run every shared check against the polish parser
    for check in basic_checks:
        check(self, parse)

View File

@ -155,8 +155,8 @@ class HTMLParser(object):
new_token = token new_token = token
while new_token is not None: while new_token is not None:
currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeNamespace = currentNode.namespace if currentNode is not None else None
currentNodeName = currentNode.name if currentNode else None currentNodeName = currentNode.name if currentNode is not None else None
type = new_token["type"] type = new_token["type"]
@ -472,9 +472,7 @@ def getPhases(debug):
self.parser.parseError("non-html-root") self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is # XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError(). # this token... If it's not, invoke self.parser.parseError().
for attr, value in token["data"].items(): self.tree.apply_html_attributes(token['data'])
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False self.parser.firstStartTag = False
def processEndTag(self, token): def processEndTag(self, token):

View File

@ -269,6 +269,11 @@ class TreeBuilder(object):
element.attributes = token["data"] element.attributes = token["data"]
return element return element
def apply_html_attributes(self, attrs):
    """Add to the first open element every attribute it does not have yet.

    Attributes already present on that element keep their current value.
    """
    for name, value in attrs.items():
        root = self.openElements[0]
        if name in root.attributes:
            # Existing values win
            continue
        root.attributes[name] = value
def _getInsertFromTable(self): def _getInsertFromTable(self):
return self._insertFromTable return self._insertFromTable