From 95190b45ad92b49a32ffb5ec45d07a648e5cc7e4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 10 Jul 2012 23:19:45 +0530
Subject: [PATCH] KF8 Input: Ignore encoding declarations inside the html markup, as they are sometimes incorrect. Fixes #1022933 (Ebook Viewer shows random Chinese words)

---
 src/calibre/ebooks/chardet.py            | 5 +++++
 src/calibre/ebooks/mobi/reader/markup.py | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 864d09108b..158b15fe49 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,10 +10,15 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 
 ENCODING_PATS = [
+        # XML declaration
         re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
             re.IGNORECASE),
+        # HTML 4 Pragma directive
         re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
             re.IGNORECASE),
+        # HTML 5 charset
+        re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
+            re.IGNORECASE),
         ]
 
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py
index 079eb90590..de06899852 100644
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 
 import re, os
 
+from calibre.ebooks.chardet import strip_encoding_declarations
+
 def update_internal_links(mobi8_reader):
     # need to update all links that are internal which
     # are based on positions within the xhtml files **BEFORE**
@@ -324,6 +326,8 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
     for i, part in enumerate(parts):
         pi = mobi8_reader.partinfo[i]
         with open(os.path.join(pi.type, pi.filename), 'wb') as f:
+            part = strip_encoding_declarations(part)
+            part = part.replace('<head>', '<head><meta charset="UTF-8"/>')
             f.write(part.encode('utf-8'))
         spine.append(f.name)
 