KF8 Input: Ignore encoding declarations inside the html markup, as they are sometimes incorrect. Fixes #1022933 (Ebook Viewer shows random Chinese words)

2025-11-16 19:43:03 -05:00 · 2012-07-10 23:19:45 +05:30 · 2012-07-10 23:19:45 +05:30 · 95190b45ad
commit 95190b45ad
parent 32a52b9e05
2 changed files with 9 additions and 0 deletions
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,10 +10,15 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 # Compiled, case-insensitive patterns that recognise an encoding declaration
 # embedded in markup. Every pattern has exactly one capture group, which
 # holds the declared encoding name.
 ENCODING_PATS = [
                # XML declaration, e.g. <?xml version="1.0" encoding="utf-8"?>
                # Group 1 is whatever sits between the quotes after encoding=.
                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
                            re.IGNORECASE),
                 # HTML 4 Pragma directive, e.g.
                 # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                 # Group 1 is the name following charset= inside the quoted
                 # content attribute.
                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
                            re.IGNORECASE),
                 # HTML 5 charset, e.g. <meta charset="utf-8">
                 # Only matches when charset is the first attribute after
                 # <meta, has no whitespace around '=' and is quoted.
                 re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
                     re.IGNORECASE),
                 ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 import re, os
 from calibre.ebooks.chardet import strip_encoding_declarations
 def update_internal_links(mobi8_reader):
    # need to update all links that are internal which
    # are based on positions within the xhtml files **BEFORE**
@@ -324,6 +326,8 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
    for i, part in enumerate(parts):
        pi = mobi8_reader.partinfo[i]
        with open(os.path.join(pi.type, pi.filename), 'wb') as f:
            part = strip_encoding_declarations(part)
            part = part.replace('<head>', '<head><meta charset="UTF-8"/>')
            f.write(part.encode('utf-8'))
            spine.append(f.name)