From 2515bd0f4eb72756b7a729ee0fc3f40ea96ab2f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Dec 2011 23:39:45 +0530 Subject: [PATCH] Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions based readers --- src/calibre/ebooks/oeb/parse_utils.py | 104 ++++++++++++++++++++------ 1 file changed, 82 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 57dc18bc32..f8456914b9 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -17,6 +17,7 @@ from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True) XHTML_NS = 'http://www.w3.org/1999/xhtml' +XMLNS_NS = 'http://www.w3.org/2000/xmlns/' class NotHTML(Exception): @@ -28,9 +29,7 @@ def barename(name): return name.rpartition('}')[-1] def namespace(name): - if '}' in name: - return name.split('}', 1)[0][1:] - return '' + return name.rpartition('}')[0][1:] def XHTML(name): return '{%s}%s' % (XHTML_NS, name) @@ -60,26 +59,86 @@ def merge_multiple_html_heads_and_bodies(root, log=None): log.warn('Merging multiple <head> and <body> sections') return root -def _html5_parse(data): +def clone_element(elem, nsmap={}, in_context=True): + if in_context: + maker = elem.getroottree().getroot().makeelement + else: + maker = etree.Element + nelem = maker(elem.tag, attrib=elem.attrib, + nsmap=nsmap) + nelem.text, nelem.tail = elem.text, elem.tail + nelem.extend(elem) + return nelem + +def html5_parse(data): import html5lib data = html5lib.parse(data, treebuilder='lxml').getroot() - html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and - ns is not None)] - if html_ns: - # html5lib causes the XHTML namespace to not - # be set as the default namespace - nsmap = dict(data.nsmap) -
nsmap[None] = XHTML_NS - for x in html_ns: - nsmap.pop(x) - nroot = etree.Element(data.tag, nsmap=nsmap, - attrib=dict(data.attrib)) - nroot.text = data.text - nroot.tail = data.tail - for child in data: - nroot.append(child) - data = nroot - return data + # Set lang correctly + xl = data.attrib.pop('xmlU0003Alang', None) + if xl is not None and 'lang' not in data.attrib: + data.attrib['lang'] = xl + + # html5lib has the most inelegant handling of namespaces I have ever seen + # Try to reconstitute destroyed namespace info + xmlns_declaration = '{%s}'%XMLNS_NS + non_html5_namespaces = {} + seen_namespaces = set() + for elem in tuple(data.iter()): + elem.attrib.pop('xmlns', None) + namespaces = {} + for x in tuple(elem.attrib): + if x.startswith('xmlnsU') or x.startswith(xmlns_declaration): + # A namespace declaration + val = elem.attrib.pop(x) + if x.startswith('xmlnsU0003A'): + prefix = x[11:] + namespaces[prefix] = val + + if namespaces: + # Some destroyed namespace declarations were found + p = elem.getparent() + if p is None: + # We handle the root node later + non_html5_namespaces = namespaces + else: + idx = p.index(elem) + p.remove(elem) + elem = clone_element(elem, nsmap=namespaces) + p.insert(idx, elem) + + b = barename(elem.tag) + idx = b.find('U0003A') + if idx > -1: + prefix, tag = b[:idx], b[idx+6:] + ns = elem.nsmap.get(prefix, None) + if ns is None: + ns = non_html5_namespaces.get(prefix, None) + if ns is not None: + elem.tag = '{%s}%s'%(ns, tag) + + for b in tuple(elem.attrib): + idx = b.find('U0003A') + if idx > -1: + prefix, tag = b[:idx], b[idx+6:] + ns = elem.nsmap.get(prefix, None) + if ns is None: + ns = non_html5_namespaces.get(prefix, None) + if ns is not None: + elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b) + + seen_namespaces |= set(elem.nsmap.itervalues()) + + nsmap = dict(html5lib.constants.namespaces) + nsmap[None] = nsmap.pop('html') + non_html5_namespaces.update(nsmap) + nsmap = non_html5_namespaces + + data = 
clone_element(data, nsmap=nsmap, in_context=False) + + # Remove unused namespace declarations + fnsmap = {k:v for k,v in nsmap.iteritems() if v in seen_namespaces and v != + XMLNS_NS} + return clone_element(data, nsmap=fnsmap, in_context=False) def _html4_parse(data, prefer_soup=False): if prefer_soup: @@ -177,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, except etree.XMLSyntaxError: log.debug('Parsing %s as HTML' % filename) try: - data = _html5_parse(data) + data = html5_parse(data) except: log.exception( 'HTML 5 parsing failed, falling back to older parsers') @@ -261,6 +320,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, nroot.append(elem) data = nroot + data = merge_multiple_html_heads_and_bodies(data, log) # Ensure has a <head> head = xpath(data, '/h:html/h:head')