Workaround for an error when parsing malformed documents containing annotation-xml tags

2025-07-09 03:04:10 -04:00 · 2016-01-09 08:21:00 +05:30 · 2016-01-09 08:21:00 +05:30 · a82ff8b749
commit a82ff8b749
parent fe84fd3519
1 changed file with 11 additions and 4 deletions
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@@ -155,10 +155,17 @@ class HTMLParser(object):
    def isHTMLIntegrationPoint(self, element):
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
-            return ("encoding" in element.attributes and
-                    element.attributes["encoding"].translate(
-                        asciiUpper2Lower) in
-                    ("text/html", "application/xhtml+xml"))
+            try:
+                return ("encoding" in element.attributes and
+                        element.attributes["encoding"].translate(
+                            asciiUpper2Lower) in
+                        ("text/html", "application/xhtml+xml"))
+            except TypeError:
+                # For some documents lxml does not store the encoding
+                # attribute as unicode, so the translate() call above raises
+                # TypeError; compare a decoded, lowercased copy instead.
+                return element.attributes["encoding"].lower().decode('utf-8', 'replace') in (
+                    "text/html", "application/xhtml+xml")
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements