Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes a regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions-based readers.

This commit is contained in:
Kovid Goyal 2011-12-18 23:39:45 +05:30
parent 8c1ebd1f40
commit 2515bd0f4e

View File

@ -17,6 +17,7 @@ from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
# Shared XML parser built with recover=True and no_network=True
# (presumably lxml.etree.XMLParser -- the import is outside this view).
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
# Namespace URI used to qualify XHTML element names (see XHTML()).
XHTML_NS = 'http://www.w3.org/1999/xhtml'
# Namespace URI of xmlns declarations; attributes qualified with it are
# treated as namespace declarations by the HTML5 parsing code.
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
class NotHTML(Exception):
@ -28,9 +29,7 @@ def barename(name):
return name.rpartition('}')[-1]
def namespace(name):
    """Return the namespace part of a ``{namespace}local`` style name.

    :param name: A tag or attribute name, optionally prefixed with a
        ``{namespace}`` qualifier.
    :return: The text between the leading ``{`` and the last ``}``, or the
        empty string when *name* contains no ``}``.
    """
    # rpartition() yields an empty head when there is no '}', so the
    # unqualified case needs no separate branch. Splitting on the last '}'
    # keeps this the exact counterpart of barename().
    return name.rpartition('}')[0][1:]
def XHTML(name):
    """Qualify *name* with the XHTML namespace, as ``{XHTML_NS}name``."""
    qualified = '{%s}%s' % (XHTML_NS, name)
    return qualified
@ -60,26 +59,86 @@ def merge_multiple_html_heads_and_bodies(root, log=None):
log.warn('Merging multiple <head> and <body> sections')
return root
def _html5_parse(data):
def clone_element(elem, nsmap=None, in_context=True):
    """Build a replacement for *elem* that carries the namespace map *nsmap*.

    The new element gets the same tag, attributes, text and tail as *elem*,
    and the children of *elem* are handed over to it via ``extend()``.
    NOTE(review): with lxml, ``extend()`` is expected to move the children
    out of *elem* rather than copy them -- confirm before reusing *elem*.

    :param elem: The element to clone.
    :param nsmap: Mapping of prefix to namespace URI for the new element.
        ``None`` (the default) is treated as an empty mapping.
    :param in_context: When true, the new element is created with the
        ``makeelement`` factory of the root of *elem*'s tree; otherwise it is
        created with ``etree.Element``.
    :return: The newly created element.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if nsmap is None:
        nsmap = {}
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    nelem.extend(elem)
    return nelem
def _resolve_mangled_name(name, elem, fallback_nsmap):
    """Turn a ``prefixU0003Alocal`` name into ``{namespace}local``.

    ``U0003A`` is the literal marker (the code point of ``:``) that shows up
    in names where a ``prefix:local`` name was present in the input.

    :param name: Tag (without namespace) or attribute name to inspect.
    :param elem: Element whose ``nsmap`` is consulted first for the prefix.
    :param fallback_nsmap: Mapping consulted when ``elem.nsmap`` does not
        know the prefix.
    :return: The namespace-qualified name, or ``None`` when *name* carries no
        marker or its prefix cannot be resolved.
    """
    idx = name.find('U0003A')
    if idx < 0:
        return None
    # 6 == len('U0003A')
    prefix, local = name[:idx], name[idx+6:]
    ns = elem.nsmap.get(prefix, None)
    if ns is None:
        ns = fallback_nsmap.get(prefix, None)
    if ns is None:
        return None
    return '{%s}%s' % (ns, local)


def html5_parse(data):
    """Parse *data* with html5lib and repair the namespace information.

    html5lib's lxml tree builder leaves namespace declarations and prefixed
    names behind as mangled plain names (``xmlnsU0003Aprefix``,
    ``prefixU0003Alocal``). This function turns them back into real
    namespace declarations and namespace-qualified tags/attributes, then
    rebuilds the root element so that it only declares namespaces that are
    actually in scope somewhere in the tree.

    :param data: The markup to parse; passed straight to ``html5lib.parse``.
    :return: The root element of the repaired tree.
    """
    import html5lib
    data = html5lib.parse(data, treebuilder='lxml').getroot()

    # Set lang correctly: promote a mangled xml:lang to lang unless a lang
    # attribute already exists.
    xl = data.attrib.pop('xmlU0003Alang', None)
    if xl is not None and 'lang' not in data.attrib:
        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}' % XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    # Iterate over a snapshot, since elements are replaced during the walk.
    for elem in tuple(data.iter()):
        elem.attrib.pop('xmlns', None)
        namespaces = {}
        for attr in tuple(elem.attrib):
            if attr.startswith('xmlnsU') or attr.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(attr)
                if attr.startswith('xmlnsU0003A'):
                    # 11 == len('xmlnsU0003A'); the rest is the prefix
                    namespaces[attr[11:]] = val

        if namespaces:
            # Some destroyed namespace declarations were found
            parent = elem.getparent()
            if parent is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                # The nsmap of an existing element cannot be edited here, so
                # swap the element for a clone that declares the namespaces.
                pos = parent.index(elem)
                parent.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                parent.insert(pos, elem)

        qname = _resolve_mangled_name(barename(elem.tag), elem,
                non_html5_namespaces)
        if qname is not None:
            elem.tag = qname

        for attr in tuple(elem.attrib):
            qname = _resolve_mangled_name(attr, elem, non_html5_namespaces)
            if qname is not None:
                elem.attrib[qname] = elem.attrib.pop(attr)

        seen_namespaces |= set(elem.nsmap.values())

    # Combine html5lib's own namespace table (with the 'html' entry promoted
    # to the default namespace) with whatever was declared on the root.
    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)
    # Remove unused namespace declarations
    fnsmap = {k: v for k, v in nsmap.items()
              if v in seen_namespaces and v != XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)
def _html4_parse(data, prefer_soup=False):
if prefer_soup:
@ -177,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
except etree.XMLSyntaxError:
log.debug('Parsing %s as HTML' % filename)
try:
data = _html5_parse(data)
data = html5_parse(data)
except:
log.exception(
'HTML 5 parsing failed, falling back to older parsers')
@ -261,6 +320,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
nroot.append(elem)
data = nroot
data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head')