mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTML 5 parser: Preserve non-core namespace declarations on the <html> tag
This commit is contained in:
parent
cc731ecd46
commit
d9806a5e6e
@ -275,7 +275,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
|
||||
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
||||
nelem.set(k, v)
|
||||
for (prefix, name), v in namespaced_attribs.iteritems():
|
||||
ns = nsmap.get('prefix', None)
|
||||
ns = nsmap.get(prefix, None)
|
||||
if ns is not None:
|
||||
try:
|
||||
nelem.set('{%s}%s' % (ns, name), v)
|
||||
@ -331,6 +331,13 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
self.openElements.append(element)
|
||||
self.document.appendChild(element)
|
||||
|
||||
def promote_elem(self, elem, tag_name):
|
||||
' Add the paraphernalia to elem that the html5lib infrastructure needs '
|
||||
self.proxy_cache.append(elem)
|
||||
elem.name = tag_name
|
||||
elem.namespace = elem.nsmap[elem.prefix]
|
||||
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
||||
|
||||
def createElement(self, token, nsmap=None):
|
||||
"""Create an element but don't insert it anywhere"""
|
||||
nsmap = nsmap or {}
|
||||
@ -346,10 +353,7 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
|
||||
# Keep a reference to elem so that lxml does not delete and re-create
|
||||
# it, losing the name related attributes
|
||||
self.proxy_cache.append(elem)
|
||||
elem.name = token_name
|
||||
elem.namespace = elem.nsmap[elem.prefix]
|
||||
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
||||
self.promote_elem(elem, token_name)
|
||||
position = token.get('position', None)
|
||||
if position is not None:
|
||||
# Unfortunately, libxml2 can only store line numbers upto 65535
|
||||
@ -388,6 +392,18 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def clone_node(self, elem, nsmap_update):
|
||||
assert len(elem) == 0
|
||||
nsmap = elem.nsmap.copy()
|
||||
nsmap.update(nsmap_update)
|
||||
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
|
||||
self.promote_elem(nelem, elem.tag.rpartition('}')[2])
|
||||
nelem.sourceline = elem.sourceline
|
||||
for k, v in elem.items():
|
||||
nelem.set(k, v)
|
||||
nelem.text, nelem.tail = elem.text, elem.tail
|
||||
return nelem
|
||||
|
||||
def apply_html_attributes(self, attrs):
|
||||
if not attrs:
|
||||
return
|
||||
@ -403,7 +419,18 @@ class TreeBuilder(BaseTreeBuilder):
|
||||
continue
|
||||
if k == 'xml:lang' and 'lang' not in html.attrib:
|
||||
k = 'lang'
|
||||
html.set(to_xml_name(k), v)
|
||||
html.set(k, v)
|
||||
continue
|
||||
if k.startswith('xmlns:') and v not in known_namespaces and v != namespaces['html'] and len(html) == 0:
|
||||
# We have a namespace declaration, the only way to add
|
||||
# it to the existing html node is to replace it.
|
||||
prefix = k[len('xmlns:'):]
|
||||
if not prefix:
|
||||
continue
|
||||
self.openElements[0] = html = self.clone_node(html, {prefix:v})
|
||||
self.document.appendChild(html)
|
||||
else:
|
||||
html.set(to_xml_name(k), v)
|
||||
|
||||
def apply_body_attributes(self, attrs):
|
||||
if not attrs:
|
||||
|
@ -180,6 +180,9 @@ class ParsingTests(BaseTest):
|
||||
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
|
||||
self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
|
||||
|
||||
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
|
||||
self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
|
||||
|
||||
def timing():
|
||||
import time, sys
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
|
Loading…
x
Reference in New Issue
Block a user