Fix bug in regex to extract charset from <meta> tags

This commit is contained in:
Kovid Goyal 2010-11-05 18:59:20 -06:00
parent 43103496ab
commit 5f6ff5609d

View File

@@ -32,7 +32,7 @@ def detect(aBuf):
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE),
-re.compile(r'''<meta\s+?[^<>]+?content\s*=\s*['"][^'"]*?charset=([-a-z0-9]+)[^'"]*?['"][^<>]*>''',
+re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-a-z0-9]+)[^'"]*?['"][^<>]*>''',
re.IGNORECASE)
]
ENTITY_PATTERN = re.compile(r'&(\S+?);')