mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions based readers
This commit is contained in:
parent
8c1ebd1f40
commit
2515bd0f4e
@ -17,6 +17,7 @@ from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
|||||||
|
|
||||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
|
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
|
||||||
|
|
||||||
class NotHTML(Exception):
|
class NotHTML(Exception):
|
||||||
|
|
||||||
def barename(name):
    '''Return the local part of a Clark-notation name.

    For example '{http://www.w3.org/1999/xhtml}body' -> 'body'.
    A name with no namespace component is returned unchanged.
    '''
    # rpartition returns the text after the last '}', or the whole
    # string when there is no '}' at all
    return name.rpartition('}')[-1]
||||||
def namespace(name):
    '''Return the namespace URI of a Clark-notation name.

    For example '{http://www.w3.org/1999/xhtml}body' ->
    'http://www.w3.org/1999/xhtml'. Returns the empty string for a
    name with no namespace component.
    '''
    if '}' in name:
        # Text before the last '}' is '{uri'; strip the leading '{'
        return name.rpartition('}')[0][1:]
    return ''
||||||
def XHTML(name):
    '''Return *name* qualified with the XHTML namespace in Clark
    notation, e.g. XHTML('body') -> '{http://www.w3.org/1999/xhtml}body'.
    '''
    return '{%s}%s' % (XHTML_NS, name)
@ -60,26 +59,86 @@ def merge_multiple_html_heads_and_bodies(root, log=None):
|
|||||||
log.warn('Merging multiple <head> and <body> sections')
|
log.warn('Merging multiple <head> and <body> sections')
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def clone_element(elem, nsmap=None, in_context=True):
    '''Return a copy of *elem* rebuilt with the namespace map *nsmap*.

    lxml does not allow the nsmap of an existing element to be changed,
    so the only way to alter an element's namespace declarations is to
    build a replacement element. The children of *elem* are moved (not
    copied) into the clone via extend().

    :param elem: the lxml element to clone
    :param nsmap: prefix -> namespace-URI mapping for the new element
        (defaults to an empty mapping)
    :param in_context: when True, create the clone through the root
        element of elem's tree so it belongs to the same document;
        when False, create a free-standing element with etree.Element
    '''
    # Avoid a mutable default argument; the default is a fresh empty map
    if nsmap is None:
        nsmap = {}
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    # extend() reparents the children from elem into the clone
    nelem.extend(elem)
    return nelem
||||||
|
def html5_parse(data):
    '''Parse *data* with html5lib and return an lxml root element with
    its namespace information reconstituted.

    html5lib's lxml treebuilder mangles namespaces: prefixed names come
    back with ':' replaced by the literal text 'U0003A', and namespace
    declarations survive only as ordinary attributes. This function
    repairs that damage so the resulting tree uses proper Clark-notation
    tags and attributes with XHTML as the default namespace, then drops
    namespace declarations that are never actually used.
    '''
    import html5lib
    data = html5lib.parse(data, treebuilder='lxml').getroot()

    # Set lang correctly: xml:lang arrives mangled as 'xmlU0003Alang'
    xl = data.attrib.pop('xmlU0003Alang', None)
    if xl is not None and 'lang' not in data.attrib:
        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}'%XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter()):
        elem.attrib.pop('xmlns', None)
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    # len('xmlnsU0003A') == 11: the rest is the prefix
                    prefix = x[11:]
                    namespaces[prefix] = val

        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                # Swap elem for a clone carrying the recovered nsmap,
                # keeping its position among its siblings
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)

        # Un-mangle a prefixed tag name, e.g. 'svgU0003Apath'
        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            # len('U0003A') == 6: skip the mangled colon
            prefix, tag = b[:idx], b[idx+6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s'%(ns, tag)

        # Un-mangle prefixed attribute names the same way
        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx+6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)

        seen_namespaces |= set(elem.nsmap.itervalues())

    # Rebuild the root nsmap: html5lib's standard namespaces (with XHTML
    # promoted to the default namespace) plus any recovered ones
    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {k:v for k, v in nsmap.iteritems() if v in seen_namespaces and
            v != XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)
||||||
def _html4_parse(data, prefer_soup=False):
|
def _html4_parse(data, prefer_soup=False):
|
||||||
if prefer_soup:
|
if prefer_soup:
|
||||||
@ -177,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.debug('Parsing %s as HTML' % filename)
|
log.debug('Parsing %s as HTML' % filename)
|
||||||
try:
|
try:
|
||||||
data = _html5_parse(data)
|
data = html5_parse(data)
|
||||||
except:
|
except:
|
||||||
log.exception(
|
log.exception(
|
||||||
'HTML 5 parsing failed, falling back to older parsers')
|
'HTML 5 parsing failed, falling back to older parsers')
|
||||||
@ -261,6 +320,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
nroot.append(elem)
|
nroot.append(elem)
|
||||||
data = nroot
|
data = nroot
|
||||||
|
|
||||||
|
|
||||||
data = merge_multiple_html_heads_and_bodies(data, log)
|
data = merge_multiple_html_heads_and_bodies(data, log)
|
||||||
# Ensure has a <head/>
|
# Ensure has a <head/>
|
||||||
head = xpath(data, '/h:html/h:head')
|
head = xpath(data, '/h:html/h:head')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user