Bypass decoding errors during HTML detection

Decoding the header chunk may fail if the file is UTF-8 encoded and the chunk boundary falls in the middle of a multi-byte character, leaving a truncated byte sequence at the end of the chunk.
This commit is contained in:
Wolfgang Maier 2020-04-29 00:44:15 +02:00 committed by GitHub
parent 529119caad
commit efee7be50d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -115,7 +115,7 @@ class HTMLFile(object):
encoding = detect_xml_encoding(src)[1] encoding = detect_xml_encoding(src)[1]
if encoding: if encoding:
try: try:
header = header.decode(encoding) header = header.decode(encoding, errors='ignore')
except ValueError: except ValueError:
pass pass
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header)) self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))