# Patch summary (commentary placed before the first diff header):
#   * src/calibre/ebooks/chardet.py: labels the existing ENCODING_PATS entries
#     ("XML declaration", "HTML 4 Pragma directive") and appends a third
#     pattern labelled "HTML 5 charset".
#   * src/calibre/ebooks/mobi/reader/markup.py: imports
#     strip_encoding_declarations from calibre.ebooks.chardet and, inside
#     expand_mobi8_markup, applies it to each part, followed by a
#     part.replace(...) call, before the part is written out UTF-8 encoded.
#
# NOTE(review): this copy of the patch looks damaged -- confirm against the
# upstream commit before applying:
#   * The diff's line breaks appear collapsed into spaces (file headers, hunk
#     headers and +/context lines all share one physical line).
#   * The "HTML 4" and "HTML 5" regex literals begin with "]*?content" and
#     "]*>" respectively, and the "HTML 5" pattern shows no capture group;
#     presumably the leading "<...>"-like part of each pattern was stripped
#     in transit -- TODO confirm.
#   * The first argument of part.replace(...) contains a raw line break and
#     the second is an empty string; this is likely damaged as well -- verify.
diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 864d09108b..158b15fe49 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -10,10 +10,15 @@ __docformat__ = 'restructuredtext en' import re, codecs ENCODING_PATS = [ + # XML declaration re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), + # HTML 4 Pragma directive re.compile(r''']*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''', re.IGNORECASE), + # HTML 5 charset + re.compile(r''']*>''', + re.IGNORECASE), ] ENTITY_PATTERN = re.compile(r'&(\S+?);') diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 079eb90590..de06899852 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en' import re, os +from calibre.ebooks.chardet import strip_encoding_declarations + def update_internal_links(mobi8_reader): # need to update all links that are internal which # are based on positions within the xhtml files **BEFORE** @@ -324,6 +326,8 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log): for i, part in enumerate(parts): pi = mobi8_reader.partinfo[i] with open(os.path.join(pi.type, pi.filename), 'wb') as f: + part = strip_encoding_declarations(part) + part = part.replace('
', '') f.write(part.encode('utf-8')) spine.append(f.name)