From a82ff8b7493ee5ee4f9d8a4fd4e56ac2c1df4267 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 9 Jan 2016 08:21:00 +0530
Subject: [PATCH] Work around for error when parsing malformed documents
 containing annotation-xml tags

---
 src/html5lib/html5parser.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
index c453552a59..aa624a094b 100644
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@@ -155,10 +155,17 @@ class HTMLParser(object):
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
-            return ("encoding" in element.attributes and
-                    element.attributes["encoding"].translate(
-                        asciiUpper2Lower) in
-                    ("text/html", "application/xhtml+xml"))
+            try:
+                return ("encoding" in element.attributes and
+                        element.attributes["encoding"].translate(
+                            asciiUpper2Lower) in
+                        ("text/html", "application/xhtml+xml"))
+            except TypeError:
+                # For some documents lxml stores the encoding attribute as a
+                # byte string rather than unicode, so translate() raises
+                # TypeError; lowercase and decode the bytes instead.
+                return element.attributes["encoding"].lower().decode('utf-8', 'replace') in (
+                    "text/html", "application/xhtml+xml")
         else:
             return (element.namespace, element.name) in htmlIntegrationPointElements