From 62d042d9d42227813c8ee91cbfbd157e7096ab2a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 25 Oct 2013 14:18:46 +0530 Subject: [PATCH] Basic parsing with the new html5lib lxml tree builder works --- src/calibre/ebooks/oeb/parse_utils.py | 7 +- src/calibre/ebooks/oeb/polish/parsing.py | 196 ++++++++++++++++-- .../ebooks/oeb/polish/tests/parsing.py | 33 ++- src/html5lib/html5parser.py | 8 +- src/html5lib/treebuilders/_base.py | 5 + 5 files changed, 212 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 88d9a198c3..7ee9f5131f 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -80,11 +80,14 @@ def node_depth(node): p = p.getparent() return ans +def fix_self_closing_cdata_tags(data): + from html5lib.constants import cdataElements, rcdataElements + return re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1>', data, flags=re.I) + def html5_parse(data, max_nesting_depth=100): import html5lib, warnings - from html5lib.constants import cdataElements, rcdataElements # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195 - data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1>', data, flags=re.I) + data = fix_self_closing_cdata_tags(data) with warnings.catch_warnings(): warnings.simplefilter('ignore') diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index 3377931c73..2d1c204425 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -7,29 +7,38 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' import copy +from functools import partial from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase -from html5lib.constants import namespaces +from html5lib.constants import namespaces, tableInsertModeElements from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.ihatexml import InfosetFilter +from html5lib.html5parser import HTMLParser + +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags +from calibre.utils.cleantext import clean_xml_chars infoset_filter = InfosetFilter() -coerce_comment = infoset_filter.coerceComment -coerce_text = infoset_filter.coerceCharacters +to_xml_name = infoset_filter.toXmlName +known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')} def create_lxml_context(): - parser = XMLParser() + parser = XMLParser(no_network=True) parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment)) return parser def ElementFactory(name, namespace=None, context=None): context = context or create_lxml_context() ns = namespace or namespaces['html'] - return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns}) + try: + return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns}) + except ValueError: + return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns}) def CommentFactory(text): - return Comment(coerce_comment(text)) + return Comment(text.replace('--', '- -')) class Element(ElementBase): @@ -59,15 +68,13 @@ class Element(ElementBase): def namespace(self): return self.nsmap[self.prefix] - @dynamic_property + @property + def nameTuple(self): + return self.nsmap[self.prefix], self.tag.rpartition('}')[2] + + @property def attributes(self): - def fget(self): - return self.attrib - def fset(self, val): - attrs = {('{%s}%s' % k) if isinstance(k, tuple) else k : v for k, v in val.iteritems()} - self.attrib.clear() - self.attrib.update(attrs) - return property(fget=fget, fset=fset) + return self.attrib @dynamic_property def childNodes(self): @@ -94,21 +101,30 @@ class Element(ElementBase): self.insert(self.index(ref_node), node) def insertText(self, data, insertBefore=None): - data = coerce_text(data) + def append_text(el, attr): + try: + setattr(el, attr, (getattr(el, attr) or '') + data) + except ValueError: + text = data.replace('\u000c', ' ') + try: + setattr(el, attr, (getattr(el, attr) or '') + text) + except ValueError: + setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text)) + if len(self) == 0: - self.text = (self.text or '') + data + append_text(self, 'text') elif insertBefore is None: # Insert the text as the tail of the last child element el = self[-1] - el.tail = (el.tail or '') + data + append_text(el, 'tail') else: # Insert the text before the specified node index = self.index(insertBefore) if index > 0: el = self[index - 1] - el.tail = (el.tail or '') + data + append_text(el, 'tail') else: - self.text = (self.text or '') + data + append_text(self, 'text') def reparentChildren(self, new_parent): # Move self.text @@ -129,7 +145,7 @@ class Comment(CommentBase): def fget(self): return self.text def fset(self, val): - self.text = coerce_comment(val) + self.text = val.replace('--', '- -') return property(fget=fget, fset=fset) @property @@ -144,6 +160,10 @@ class Comment(CommentBase): def namespace(self): return None + @property + def nameTuple(self): + return None, None + @property def childNodes(self): return [] @@ -164,7 +184,7 @@ class Comment(CommentBase): reparentChildren = no_op def insertText(self, text, insertBefore=None): - self.text = (self.text or '') + coerce_comment(text) + self.text = (self.text or '') + text.replace('--', '- -') def cloneNode(self): return copy.copy(self) @@ -187,6 +207,43 @@ class DocType(object): self.text = self.name = name self.public_id, self.system_id = public_id, system_id +def process_attribs(attrs, nsmap): + attribs = {} + namespaced_attribs = {} + xmlns = namespaces['xmlns'] + for k, v in attrs.iteritems(): + if isinstance(k, tuple): + if k[2] == xmlns: + prefix, name, ns = k + if prefix is None: + nsmap[None] = v + else: + nsmap[name] = v + else: + attribs['{%s}%s' % (k[2], k[1])] = v + else: + if ':' in k: + if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'): + prefix = k.partition(':')[2] or None + nsmap[prefix] = v + else: + namespaced_attribs[k] = v + else: + attribs[k] = v + + for k, v in namespaced_attribs.iteritems(): + prefix, name = k.partition(':')[0::2] + if prefix == 'xml': + if name == 'lang': + attribs['lang'] = attribs.get('lang', v) + continue + ns = nsmap.get(prefix, None) + if ns is not None: + name = '{%s}%s' % (ns, name) + attribs[name] =v + + return attribs + class TreeBuilder(BaseTreeBuilder): elementClass = ElementFactory @@ -194,6 +251,101 @@ class TreeBuilder(BaseTreeBuilder): documentClass = Document doctypeClass = DocType - def __init__(self): + def __init__(self, namespaceHTMLElements=True): BaseTreeBuilder.__init__(self, True) + self.lxml_context = create_lxml_context() + self.elementClass = partial(ElementFactory, context=self.lxml_context) + + def getDocument(self): + return self.document.root + + # The following methods are re-implementations from BaseTreeBuilder to + # handle namespaces properly. + + def insertRoot(self, token): + element = self.createElement(token, nsmap={None:namespaces['html']}) + self.openElements.append(element) + self.document.appendChild(element) + + def createElement(self, token, nsmap=None): + """Create an element but don't insert it anywhere""" + nsmap = nsmap or {} + attribs = process_attribs(token['data'], nsmap) + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + if ':' in name: + prefix, name = name.partition(':')[0::2] + namespace = nsmap.get(prefix, namespace) + try: + elem = self.lxml_context.makeelement('{%s}%s' % (namespace, name), attrib=attribs, nsmap=nsmap) + except ValueError: + attribs = {to_xml_name(k):v for k, v in attribs.iteritems()} + elem = self.lxml_context.makeelement('{%s}%s' % (namespace, to_xml_name(name)), attrib=attribs, nsmap=nsmap) + + # Ensure that svg and mathml elements get nice namespace prefixes if + # the input document is HTML 5 with no namespace information + if elem.prefix is not None and elem.prefix.startswith('ns') and namespace not in set(nsmap.itervalues()) and namespace in known_namespaces: + prefix = known_namespaces[namespace] + if prefix not in nsmap: + nsmap[prefix] = namespace + elem = self.lxml_context.makeelement(elem.tag, attrib=elem.attrib, nsmap=nsmap) + return elem + + def insertElementNormal(self, token): + parent = self.openElements[-1] + element = self.createElement(token, parent.nsmap) + parent.appendChild(element) + self.openElements.append(element) + return element + + def insertElementTable(self, token): + """Create an element and insert it into the tree""" + if self.openElements[-1].name not in tableInsertModeElements: + return self.insertElementNormal(token) + # We should be in the InTable mode. This means we want to do + # special magic element rearranging + parent, insertBefore = self.getTableMisnestedNodePosition() + element = self.createElement(token, nsmap=parent.nsmap) + if insertBefore is None: + parent.appendChild(element) + else: + parent.insertBefore(element, insertBefore) + self.openElements.append(element) + return element + + def apply_html_attributes(self, attrs): + html = self.openElements[0] + if len(html) > 0: + raise ValueError('Cannot apply attributes to after it has children') + nsmap = html.nsmap.copy() + attribs = process_attribs(attrs, nsmap) + for k, v in attribs.iteritems(): + if k not in html.attrib: + try: + html.set(k, v) + except ValueError: + html.set(to_xml_name(k), v) + if nsmap != html.nsmap: + newroot = self.lxml_context.makeelement(html.tag, attrib=html.attrib, nsmap=nsmap) + self.openElements[0] = newroot + if self.document.root is html: + self.document.root = newroot + +def parse(raw, decoder=None): + if isinstance(raw, bytes): + raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) + # TODO: Replace entities? + raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser + # TODO: ignore warnings + parser = HTMLParser(tree=TreeBuilder) + parser.parse(raw, parseMeta=False, useChardet=False) + root = parser.tree.getDocument() + return root + + +if __name__ == '__main__': + from lxml import etree + root = parse('

') + print (etree.tostring(root)) + print() diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py index 014985e02d..ce5d18b494 100644 --- a/src/calibre/ebooks/oeb/polish/tests/parsing.py +++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py @@ -10,6 +10,7 @@ from lxml import etree from html5lib.constants import cdataElements, rcdataElements from calibre.ebooks.oeb.polish.tests.base import BaseTest +from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS from calibre.ebooks.oeb.parse_utils import html5_parse @@ -28,11 +29,17 @@ def nonvoid_cdata_elements(test, parse_function): def namespaces(test, parse_function): ae = test.assertEqual + def match_and_prefix(root, xpath, prefix, err=''): + matches = XPath(xpath)(root) + ae(len(matches), 1, err) + ae(matches[0].prefix, prefix) + markup = ''' '''.format(xhtml=XHTML_NS) root = parse_function(markup) ae( len(XPath('//h:body[@id="test"]')(root)), 1, 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)) + match_and_prefix(root, '//h:body[@id="test"]', None) markup = ''' @@ -40,9 +47,9 @@ def namespaces(test, parse_function): '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS) root = parse_function(markup) err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root) - ae(len(XPath('//h:body[@id="test"]')(root)), 1, err) - ae(len(XPath('//svg:svg')(root)), 1, err) - ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) + match_and_prefix(root, '//h:body[@id="test"]', None, err) + match_and_prefix(root, '//svg:svg', 'svg', err) + match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err) markup = ''' @@ -50,15 +57,15 @@ def namespaces(test, parse_function): '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS) root = parse_function(markup) err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root) - ae(len(XPath('//h:body[@id="test"]')(root)), 1, err) - ae(len(XPath('//svg:svg')(root)), 1, err) - ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) + match_and_prefix(root, '//h:body[@id="test"]', None, err) + match_and_prefix(root, '//svg:svg', None if parse_function is parse else 'svg', err) + match_and_prefix(root, '//svg:image[@xl:href]', None if parse_function is parse else 'svg', err) markup = '' root = parse_function(markup) err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root) - ae(len(XPath('//svg:svg')(root)), 1, err) - ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err) + match_and_prefix(root, '//svg:svg', 'svg', err) + match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err) markup = '' root = parse_function(markup) @@ -70,6 +77,9 @@ def namespaces(test, parse_function): ae(len(xpath('//ns2:tag3')), 1, err) ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err) ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err) + for tag in root.iter(): + if 'NS' in tag.tag: + ae('ns1', tag.prefix) markup = '

' root = parse_function(markup) @@ -84,6 +94,8 @@ def space_characters(test, parse_function): root = parse_function(markup) err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root) test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err) + markup = '

\u000b\u000c

' + root = parse_function(markup) # Should strip non XML safe control code \u000b def case_insensitive_element_names(test, parse_function): markup = '

' @@ -99,3 +111,8 @@ class ParsingTests(BaseTest): ' Test parsing with the HTML5 parser used for conversion ' for test in basic_checks: test(self, html5_parse) + + def test_polish_parser(self): + ' Test parsing with the HTML5 parser used for polishing ' + for test in basic_checks: + test(self, parse) diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py index b0f14f3935..91897f4362 100644 --- a/src/html5lib/html5parser.py +++ b/src/html5lib/html5parser.py @@ -155,8 +155,8 @@ class HTMLParser(object): new_token = token while new_token is not None: currentNode = self.tree.openElements[-1] if self.tree.openElements else None - currentNodeNamespace = currentNode.namespace if currentNode else None - currentNodeName = currentNode.name if currentNode else None + currentNodeNamespace = currentNode.namespace if currentNode is not None else None + currentNodeName = currentNode.name if currentNode is not None else None type = new_token["type"] @@ -472,9 +472,7 @@ def getPhases(debug): self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[0].attributes: - self.tree.openElements[0].attributes[attr] = value + self.tree.apply_html_attributes(token['data']) self.parser.firstStartTag = False def processEndTag(self, token): diff --git a/src/html5lib/treebuilders/_base.py b/src/html5lib/treebuilders/_base.py index 8b97cc11a2..f426823c1f 100644 --- a/src/html5lib/treebuilders/_base.py +++ b/src/html5lib/treebuilders/_base.py @@ -269,6 +269,11 @@ class TreeBuilder(object): element.attributes = token["data"] return element + def apply_html_attributes(self, attrs): + for attr, value in attrs.items(): + if attr not in self.openElements[0].attributes: + self.openElements[0].attributes[attr] = value + def _getInsertFromTable(self): return self._insertFromTable