mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Fix explicitly closed <meta charset> tags causing strip_encoding_declarations to result in invalid XML
This commit is contained in:
parent
e35586fd00
commit
84a1eb96fa
@ -11,15 +11,12 @@ import re, codecs
|
|||||||
|
|
||||||
ENCODING_PATS = [
|
ENCODING_PATS = [
|
||||||
# XML declaration
|
# XML declaration
|
||||||
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
|
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||||
re.IGNORECASE),
|
|
||||||
# HTML 4 Pragma directive
|
# HTML 4 Pragma directive
|
||||||
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
|
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
|
||||||
re.IGNORECASE),
|
|
||||||
# HTML 5 charset
|
# HTML 5 charset
|
||||||
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
|
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
|
||||||
re.IGNORECASE),
|
]
|
||||||
]
|
|
||||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
def strip_encoding_declarations(raw):
|
def strip_encoding_declarations(raw):
|
||||||
@ -35,8 +32,8 @@ def substitute_entites(raw):
|
|||||||
from calibre import xml_entity_to_unicode
|
from calibre import xml_entity_to_unicode
|
||||||
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
||||||
|
|
||||||
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
|
||||||
"x-sjis" : "shift-jis" }
|
"x-sjis" : "shift-jis"}
|
||||||
|
|
||||||
def detect(*args, **kwargs):
|
def detect(*args, **kwargs):
|
||||||
from chardet import detect
|
from chardet import detect
|
||||||
@ -58,8 +55,7 @@ def force_encoding(raw, verbose, assume_utf8=False):
|
|||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = preferred_encoding
|
encoding = preferred_encoding
|
||||||
encoding = encoding.lower()
|
encoding = encoding.lower()
|
||||||
if _CHARSET_ALIASES.has_key(encoding):
|
encoding = _CHARSET_ALIASES.get(encoding, encoding)
|
||||||
encoding = _CHARSET_ALIASES[encoding]
|
|
||||||
if encoding == 'ascii':
|
if encoding == 'ascii':
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
return encoding
|
return encoding
|
||||||
|
Loading…
x
Reference in New Issue
Block a user