Fix strip_encoding_declarations producing invalid XML when a <meta charset> tag is explicitly closed (i.e. followed by </meta>)

This commit is contained in:
Kovid Goyal 2014-01-31 21:24:39 +05:30
parent e35586fd00
commit 84a1eb96fa

View File

@ -10,16 +10,13 @@ __docformat__ = 'restructuredtext en'
import re, codecs import re, codecs
ENCODING_PATS = [ ENCODING_PATS = [
# XML declaration # XML declaration
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.IGNORECASE), # HTML 4 Pragma directive
# HTML 4 Pragma directive re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''', # HTML 5 charset
re.IGNORECASE), re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
# HTML 5 charset ]
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
re.IGNORECASE),
]
ENTITY_PATTERN = re.compile(r'&(\S+?);') ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw): def strip_encoding_declarations(raw):
@ -35,8 +32,8 @@ def substitute_entites(raw):
from calibre import xml_entity_to_unicode from calibre import xml_entity_to_unicode
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw) return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
_CHARSET_ALIASES = { "macintosh" : "mac-roman", _CHARSET_ALIASES = {"macintosh" : "mac-roman",
"x-sjis" : "shift-jis" } "x-sjis" : "shift-jis"}
def detect(*args, **kwargs): def detect(*args, **kwargs):
from chardet import detect from chardet import detect
@ -58,8 +55,7 @@ def force_encoding(raw, verbose, assume_utf8=False):
if not encoding: if not encoding:
encoding = preferred_encoding encoding = preferred_encoding
encoding = encoding.lower() encoding = encoding.lower()
if _CHARSET_ALIASES.has_key(encoding): encoding = _CHARSET_ALIASES.get(encoding, encoding)
encoding = _CHARSET_ALIASES[encoding]
if encoding == 'ascii': if encoding == 'ascii':
encoding = 'utf-8' encoding = 'utf-8'
return encoding return encoding