From 9e88dfd3b4d8665fc847274cf7fbf5572a686789 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 29 Apr 2020 08:51:32 +0530 Subject: [PATCH] py3 compat --- src/calibre/ebooks/html/input.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 011b53dd35..e37fd3b928 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -91,6 +91,7 @@ class HTMLFile(object): ''' HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) + HTML_PAT_BIN = re.compile(br'<\s*html', re.IGNORECASE) TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) LINK_PAT = re.compile( r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', @@ -115,10 +116,13 @@ class HTMLFile(object): encoding = detect_xml_encoding(src)[1] if encoding: try: - header = header.decode(encoding, errors='ignore') + header = header.decode(encoding, errors='replace') except ValueError: pass - self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header)) + self.is_binary = False + if level > 0: + pat = self.HTML_PAT_BIN if isinstance(header, bytes) else self.HTML_PAT + self.is_binary = not bool(pat.search(header)) if not self.is_binary: src += f.read() except IOError as err: