diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 011b53dd35..e37fd3b928 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -91,6 +91,7 @@ class HTMLFile(object): ''' HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) + HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE) TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) LINK_PAT = re.compile( r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', @@ -115,10 +116,13 @@ class HTMLFile(object): encoding = detect_xml_encoding(src)[1] if encoding: try: - header = header.decode(encoding, errors='ignore') + header = header.decode(encoding, errors='replace') except ValueError: pass - self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header)) + self.is_binary = False + if level > 0: + pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT + self.is_binary = not bool(pat.search(header)) if not self.is_binary: src += f.read() except IOError as err: