mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTML 5 parser: Preserve non-core namespace declarations on the <html> tag
This commit is contained in:
parent
cc731ecd46
commit
d9806a5e6e
@ -275,7 +275,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
|
|||||||
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
||||||
nelem.set(k, v)
|
nelem.set(k, v)
|
||||||
for (prefix, name), v in namespaced_attribs.iteritems():
|
for (prefix, name), v in namespaced_attribs.iteritems():
|
||||||
ns = nsmap.get('prefix', None)
|
ns = nsmap.get(prefix, None)
|
||||||
if ns is not None:
|
if ns is not None:
|
||||||
try:
|
try:
|
||||||
nelem.set('{%s}%s' % (ns, name), v)
|
nelem.set('{%s}%s' % (ns, name), v)
|
||||||
@ -331,6 +331,13 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
self.openElements.append(element)
|
self.openElements.append(element)
|
||||||
self.document.appendChild(element)
|
self.document.appendChild(element)
|
||||||
|
|
||||||
|
def promote_elem(self, elem, tag_name):
|
||||||
|
' Add the paraphernalia to elem that the html5lib infrastructure needs '
|
||||||
|
self.proxy_cache.append(elem)
|
||||||
|
elem.name = tag_name
|
||||||
|
elem.namespace = elem.nsmap[elem.prefix]
|
||||||
|
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
||||||
|
|
||||||
def createElement(self, token, nsmap=None):
|
def createElement(self, token, nsmap=None):
|
||||||
"""Create an element but don't insert it anywhere"""
|
"""Create an element but don't insert it anywhere"""
|
||||||
nsmap = nsmap or {}
|
nsmap = nsmap or {}
|
||||||
@ -346,10 +353,7 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
|
|
||||||
# Keep a reference to elem so that lxml does not delete and re-create
|
# Keep a reference to elem so that lxml does not delete and re-create
|
||||||
# it, losing the name related attributes
|
# it, losing the name related attributes
|
||||||
self.proxy_cache.append(elem)
|
self.promote_elem(elem, token_name)
|
||||||
elem.name = token_name
|
|
||||||
elem.namespace = elem.nsmap[elem.prefix]
|
|
||||||
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
|
||||||
position = token.get('position', None)
|
position = token.get('position', None)
|
||||||
if position is not None:
|
if position is not None:
|
||||||
# Unfortunately, libxml2 can only store line numbers up to 65535
|
# Unfortunately, libxml2 can only store line numbers up to 65535
|
||||||
@ -388,6 +392,18 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
self.openElements.append(element)
|
self.openElements.append(element)
|
||||||
return element
|
return element
|
||||||
|
|
||||||
|
def clone_node(self, elem, nsmap_update):
|
||||||
|
assert len(elem) == 0
|
||||||
|
nsmap = elem.nsmap.copy()
|
||||||
|
nsmap.update(nsmap_update)
|
||||||
|
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
|
||||||
|
self.promote_elem(nelem, elem.tag.rpartition('}')[2])
|
||||||
|
nelem.sourceline = elem.sourceline
|
||||||
|
for k, v in elem.items():
|
||||||
|
nelem.set(k, v)
|
||||||
|
nelem.text, nelem.tail = elem.text, elem.tail
|
||||||
|
return nelem
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
def apply_html_attributes(self, attrs):
|
||||||
if not attrs:
|
if not attrs:
|
||||||
return
|
return
|
||||||
@ -403,7 +419,18 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
continue
|
continue
|
||||||
if k == 'xml:lang' and 'lang' not in html.attrib:
|
if k == 'xml:lang' and 'lang' not in html.attrib:
|
||||||
k = 'lang'
|
k = 'lang'
|
||||||
html.set(to_xml_name(k), v)
|
html.set(k, v)
|
||||||
|
continue
|
||||||
|
if k.startswith('xmlns:') and v not in known_namespaces and v != namespaces['html'] and len(html) == 0:
|
||||||
|
# We have a namespace declaration, the only way to add
|
||||||
|
# it to the existing html node is to replace it.
|
||||||
|
prefix = k[len('xmlns:'):]
|
||||||
|
if not prefix:
|
||||||
|
continue
|
||||||
|
self.openElements[0] = html = self.clone_node(html, {prefix:v})
|
||||||
|
self.document.appendChild(html)
|
||||||
|
else:
|
||||||
|
html.set(to_xml_name(k), v)
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
def apply_body_attributes(self, attrs):
|
||||||
if not attrs:
|
if not attrs:
|
||||||
|
@ -180,6 +180,9 @@ class ParsingTests(BaseTest):
|
|||||||
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
|
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
|
||||||
self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
|
self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
|
||||||
|
|
||||||
|
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
|
||||||
|
self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
|
||||||
|
|
||||||
def timing():
|
def timing():
|
||||||
import time, sys
|
import time, sys
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
Loading…
x
Reference in New Issue
Block a user