diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 011b53dd35..e37fd3b928 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -91,6 +91,7 @@ class HTMLFile(object):
'''
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
+ HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE)
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
@@ -115,10 +116,13 @@ class HTMLFile(object):
encoding = detect_xml_encoding(src)[1]
if encoding:
try:
- header = header.decode(encoding, errors='ignore')
+ header = header.decode(encoding, errors='replace')
except ValueError:
pass
- self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
+ self.is_binary = False
+ if level > 0:
+ pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT
+ self.is_binary = not bool(pat.search(header))
if not self.is_binary:
src += f.read()
except IOError as err: