mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Parse namespace prefixed <html:html> correctly
This commit is contained in:
parent
d3352aeec9
commit
fe32a39f0f
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import copy
|
||||
import copy, re
|
||||
from functools import partial
|
||||
|
||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
||||
@ -24,6 +24,12 @@ infoset_filter = InfosetFilter()
|
||||
to_xml_name = infoset_filter.toXmlName
|
||||
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
|
||||
|
||||
class NamespacedHTMLPresent(ValueError):
    """Signals that a namespace-prefixed ``html`` tag was encountered.

    The offending prefix is stored on the instance as ``prefix`` and is
    also passed to ``ValueError`` as the sole exception argument.
    """

    def __init__(self, prefix):
        super(NamespacedHTMLPresent, self).__init__(prefix)
        # Kept as an attribute so handlers can read it without digging
        # through ``args``.
        self.prefix = prefix
|
||||
|
||||
def create_lxml_context():
|
||||
parser = XMLParser(no_network=True)
|
||||
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
||||
@ -225,6 +231,12 @@ def process_attribs(attrs, nsmap):
|
||||
if ':' in k:
|
||||
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
|
||||
prefix = k.partition(':')[2] or None
|
||||
if prefix is not None:
|
||||
# Use an existing prefix for this namespace, if
|
||||
# possible
|
||||
existing = {v:k for k, v in nsmap.iteritems()}.get(v, False)
|
||||
if existing is not False:
|
||||
prefix = existing
|
||||
nsmap[prefix] = v
|
||||
else:
|
||||
namespaced_attribs[k] = v
|
||||
@ -255,6 +267,7 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
BaseTreeBuilder.__init__(self, True)
|
||||
self.lxml_context = create_lxml_context()
|
||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
||||
self.seen_extra_html = False
|
||||
|
||||
def getDocument(self):
|
||||
return self.document.root
|
||||
@ -272,6 +285,8 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
nsmap = nsmap or {}
|
||||
attribs = process_attribs(token['data'], nsmap)
|
||||
name = token["name"]
|
||||
if name.endswith(':html'):
|
||||
raise NamespacedHTMLPresent(name.rpartition(':')[0])
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
if ':' in name:
|
||||
prefix, name = name.partition(':')[0::2]
|
||||
@ -314,9 +329,9 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
return element
|
||||
|
||||
def apply_html_attributes(self, attrs):
|
||||
if not attrs:
|
||||
return
|
||||
html = self.openElements[0]
|
||||
if len(html) > 0:
|
||||
raise ValueError('Cannot apply attributes to <html> after it has children')
|
||||
nsmap = html.nsmap.copy()
|
||||
attribs = process_attribs(attrs, nsmap)
|
||||
for k, v in attribs.iteritems():
|
||||
@ -330,22 +345,34 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
self.openElements[0] = newroot
|
||||
if self.document.root is html:
|
||||
self.document.root = newroot
|
||||
if len(html) > 0:
|
||||
# TODO: the nsmap changes need to be propagated down the tree
|
||||
for child in html:
|
||||
newroot.append(copy.copy(child))
|
||||
|
||||
def parse(raw, decoder=None, log=None):
    """Parse HTML markup and return the root ``html`` element.

    :param raw: The markup, as bytes or text. Bytes are decoded with
        ``decoder`` when one is given, otherwise with the first item of
        the result of ``xml_to_unicode``.
    :param decoder: Optional callable that turns bytes into text.
    :param log: Accepted for interface compatibility; not used by this
        function.
    :return: The document root returned by the tree builder.
    :raises NamespacedHTMLPresent: If a namespace-prefixed ``html`` tag
        is reported but stripping its prefix does not change the markup
        (retrying would loop forever).
    :raises ValueError: If the root is not an un-prefixed ``html``
        element in the HTML namespace.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    # TODO: Replace entities?
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser

    # TODO: ignore warnings
    while True:
        try:
            parser = HTMLParser(tree=TreeBuilder)
            parser.parse(raw, parseMeta=False, useChardet=False)
        except NamespacedHTMLPresent as err:
            # The tree builder found a tag like <prefix:html>. Drop
            # "prefix:" from every opening/closing tag that uses it and
            # parse again. The prefix is escaped because XML prefixes may
            # contain regex metacharacters such as '.'.
            pattern = r'(<\s*/?)%s:' % re.escape(err.prefix)
            stripped = re.sub(pattern, r'\1', raw, flags=re.I)
            if stripped == raw:
                # Nothing was removed, so another pass would fail in
                # exactly the same way; give up instead of spinning.
                raise
            raw = stripped
            continue
        break

    root = parser.tree.getDocument()
    if root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc smoke test: run two sample documents through parse() and
    # dump the tree produced for the second one.
    from lxml import etree
    root = parse('<html><p -moo><gah\u000c>')
    prefixed_markup = '<html:html xmlns:html="{html}" id="a"><html:p><html:p></html:html>'.format(
        html=namespaces['html'])
    root = parse(prefixed_markup)
    serialized = etree.tostring(root)
    print (serialized)
    print()
|
||||
|
||||
|
@ -67,6 +67,18 @@ def namespaces(test, parse_function):
|
||||
match_and_prefix(root, '//svg:svg', 'svg', err)
|
||||
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
||||
|
||||
root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
|
||||
err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(root)
|
||||
match_and_prefix(root, '//h:html', None, err)
|
||||
match_and_prefix(root, '//h:html[@lang]', None, err)
|
||||
match_and_prefix(root, '//h:html[@id]', None, err)
|
||||
|
||||
if parse_function is not html5_parse:
|
||||
markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
|
||||
root = parse_function(markup)
|
||||
err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root)
|
||||
match_and_prefix(root, '//h:html', None, err)
|
||||
|
||||
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
||||
root = parse_function(markup)
|
||||
err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)
|
||||
|
Loading…
x
Reference in New Issue
Block a user