mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Parse namespace prefixed <html:html> correctly
This commit is contained in:
parent
d3352aeec9
commit
fe32a39f0f
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import copy
|
import copy, re
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
||||||
@ -24,6 +24,12 @@ infoset_filter = InfosetFilter()
|
|||||||
to_xml_name = infoset_filter.toXmlName
|
to_xml_name = infoset_filter.toXmlName
|
||||||
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
|
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
|
||||||
|
|
||||||
|
class NamespacedHTMLPresent(ValueError):
|
||||||
|
|
||||||
|
def __init__(self, prefix):
|
||||||
|
ValueError.__init__(self, prefix)
|
||||||
|
self.prefix = prefix
|
||||||
|
|
||||||
def create_lxml_context():
|
def create_lxml_context():
|
||||||
parser = XMLParser(no_network=True)
|
parser = XMLParser(no_network=True)
|
||||||
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
||||||
@ -225,6 +231,12 @@ def process_attribs(attrs, nsmap):
|
|||||||
if ':' in k:
|
if ':' in k:
|
||||||
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
|
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
|
||||||
prefix = k.partition(':')[2] or None
|
prefix = k.partition(':')[2] or None
|
||||||
|
if prefix is not None:
|
||||||
|
# Use an existing prefix for this namespace, if
|
||||||
|
# possible
|
||||||
|
existing = {v:k for k, v in nsmap.iteritems()}.get(v, False)
|
||||||
|
if existing is not False:
|
||||||
|
prefix = existing
|
||||||
nsmap[prefix] = v
|
nsmap[prefix] = v
|
||||||
else:
|
else:
|
||||||
namespaced_attribs[k] = v
|
namespaced_attribs[k] = v
|
||||||
@ -255,6 +267,7 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
BaseTreeBuilder.__init__(self, True)
|
BaseTreeBuilder.__init__(self, True)
|
||||||
self.lxml_context = create_lxml_context()
|
self.lxml_context = create_lxml_context()
|
||||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
||||||
|
self.seen_extra_html = False
|
||||||
|
|
||||||
def getDocument(self):
|
def getDocument(self):
|
||||||
return self.document.root
|
return self.document.root
|
||||||
@ -272,6 +285,8 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
nsmap = nsmap or {}
|
nsmap = nsmap or {}
|
||||||
attribs = process_attribs(token['data'], nsmap)
|
attribs = process_attribs(token['data'], nsmap)
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
|
if name.endswith(':html'):
|
||||||
|
raise NamespacedHTMLPresent(name.rpartition(':')[0])
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
if ':' in name:
|
if ':' in name:
|
||||||
prefix, name = name.partition(':')[0::2]
|
prefix, name = name.partition(':')[0::2]
|
||||||
@ -314,9 +329,9 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
return element
|
return element
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
def apply_html_attributes(self, attrs):
|
||||||
|
if not attrs:
|
||||||
|
return
|
||||||
html = self.openElements[0]
|
html = self.openElements[0]
|
||||||
if len(html) > 0:
|
|
||||||
raise ValueError('Cannot apply attributes to <html> after it has children')
|
|
||||||
nsmap = html.nsmap.copy()
|
nsmap = html.nsmap.copy()
|
||||||
attribs = process_attribs(attrs, nsmap)
|
attribs = process_attribs(attrs, nsmap)
|
||||||
for k, v in attribs.iteritems():
|
for k, v in attribs.iteritems():
|
||||||
@ -330,22 +345,34 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
self.openElements[0] = newroot
|
self.openElements[0] = newroot
|
||||||
if self.document.root is html:
|
if self.document.root is html:
|
||||||
self.document.root = newroot
|
self.document.root = newroot
|
||||||
|
if len(html) > 0:
|
||||||
|
# TODO: the nsmap changes need to be propagated down the tree
|
||||||
|
for child in html:
|
||||||
|
newroot.append(copy.copy(child))
|
||||||
|
|
||||||
def parse(raw, decoder=None):
|
def parse(raw, decoder=None, log=None):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
# TODO: Replace entities?
|
# TODO: Replace entities?
|
||||||
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
||||||
# TODO: ignore warnings
|
# TODO: ignore warnings
|
||||||
parser = HTMLParser(tree=TreeBuilder)
|
while True:
|
||||||
parser.parse(raw, parseMeta=False, useChardet=False)
|
try:
|
||||||
|
parser = HTMLParser(tree=TreeBuilder)
|
||||||
|
parser.parse(raw, parseMeta=False, useChardet=False)
|
||||||
|
except NamespacedHTMLPresent as err:
|
||||||
|
raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
|
||||||
|
continue
|
||||||
|
break
|
||||||
root = parser.tree.getDocument()
|
root = parser.tree.getDocument()
|
||||||
|
if root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix:
|
||||||
|
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
||||||
return root
|
return root
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
root = parse('<html><p -moo><gah\u000c>')
|
root = parse('<html:html xmlns:html="{html}" id="a"><html:p><html:p></html:html>'.format(html=namespaces['html']))
|
||||||
print (etree.tostring(root))
|
print (etree.tostring(root))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
@ -67,6 +67,18 @@ def namespaces(test, parse_function):
|
|||||||
match_and_prefix(root, '//svg:svg', 'svg', err)
|
match_and_prefix(root, '//svg:svg', 'svg', err)
|
||||||
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
||||||
|
|
||||||
|
root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
|
||||||
|
err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(root)
|
||||||
|
match_and_prefix(root, '//h:html', None, err)
|
||||||
|
match_and_prefix(root, '//h:html[@lang]', None, err)
|
||||||
|
match_and_prefix(root, '//h:html[@id]', None, err)
|
||||||
|
|
||||||
|
if parse_function is not html5_parse:
|
||||||
|
markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
|
||||||
|
root = parse_function(markup)
|
||||||
|
err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root)
|
||||||
|
match_and_prefix(root, '//h:html', None, err)
|
||||||
|
|
||||||
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)
|
err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user