From a82ff8b7493ee5ee4f9d8a4fd4e56ac2c1df4267 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 9 Jan 2016 08:21:00 +0530
Subject: [PATCH] Work around for error when parsing malformed documents containing annotation-xml tags

---
 src/html5lib/html5parser.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
index c453552a59..aa624a094b 100644
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@@ -155,10 +155,17 @@ class HTMLParser(object):
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
-            return ("encoding" in element.attributes and
-                    element.attributes["encoding"].translate(
-                        asciiUpper2Lower) in
-                    ("text/html", "application/xhtml+xml"))
+            try:
+                return ("encoding" in element.attributes and
+                        element.attributes["encoding"].translate(
+                            asciiUpper2Lower) in
+                        ("text/html", "application/xhtml+xml"))
+            except TypeError:
+                # This happens for some documents, for some reason
+                # lxml refuses to store a unicode representation of the
+                # encoding attribute.
+                return element.attributes["encoding"].lower().decode('utf-8', 'replace') in (
+                    "text/html", "application/xhtml+xml")
         else:
             return (element.namespace, element.name) in htmlIntegrationPointElements
 