From ad34b0ea3b2d0ba514e93d17b060a3d9af1247eb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 15 Oct 2022 18:02:11 +0530 Subject: [PATCH] We can no longer rely on confidence from chardet since it's always 1 with the move to the C-based chardet library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So for files where we assume utf-8, use utf-8 if no explicit encoding is found. Fixes #1993029 [Apostrophe in book title turns into "à€™" upon import](https://bugs.launchpad.net/calibre/+bug/1993029) --- src/calibre/ebooks/chardet.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 4aeceea0fd..afa0d9b4d8 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -154,6 +154,11 @@ def detect_xml_encoding(raw, verbose=False, assume_utf8=False): encoding = encoding.decode('ascii', 'replace') break if encoding is None: + if assume_utf8: + try: + return raw.decode('utf-8'), 'utf-8' + except UnicodeDecodeError: + pass encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8) if encoding.lower().strip() == 'macintosh': encoding = 'mac-roman'