Fix explicitly closed <meta charset> tags causing strip_encoding_declarations to result in invalid XML

2025-11-23 15:03:03 -05:00 · 2014-01-31 21:24:39 +05:30 · 2014-01-31 21:24:39 +05:30 · 84a1eb96fa
commit 84a1eb96fa
parent e35586fd00
1 changed files with 10 additions and 14 deletions
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,16 +10,13 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 ENCODING_PATS = [
-                # XML declaration
+    # XML declaration
-                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
+    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
-                            re.IGNORECASE),
+    # HTML 4 Pragma directive
-                 # HTML 4 Pragma directive
+    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
-                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
+    # HTML 5 charset
-                            re.IGNORECASE),
+    re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
-                 # HTML 5 charset
+]
-                re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
-                    re.IGNORECASE),
-                ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 def strip_encoding_declarations(raw):
@@ -35,8 +32,8 @@ def substitute_entites(raw):
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
-_CHARSET_ALIASES = { "macintosh" : "mac-roman",
+_CHARSET_ALIASES = {"macintosh" : "mac-roman",
-                        "x-sjis" : "shift-jis" }
+                        "x-sjis" : "shift-jis"}
 def detect(*args, **kwargs):
    from chardet import detect
@@ -58,8 +55,7 @@ def force_encoding(raw, verbose, assume_utf8=False):
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
-    if _CHARSET_ALIASES.has_key(encoding):
+    encoding = _CHARSET_ALIASES.get(encoding, encoding)
-        encoding = _CHARSET_ALIASES[encoding]
    if encoding == 'ascii':
        encoding = 'utf-8'
    return encoding