mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When detecting the encoding of HTML documents, if the document contains multiple charset declarations, prefer the HTML 5 syntax to the HTML 4 syntax. Fixes #1364961 [Unicode Conversion on Amazon after Release 2.x](https://bugs.launchpad.net/calibre/+bug/1364961)
This commit is contained in:
parent
e905c093b8
commit
78e28cbe9e
@@ -12,10 +12,10 @@ import re, codecs
|
||||
ENCODING_PATS = [
|
||||
# XML declaration
|
||||
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||
# HTML 4 Pragma directive
|
||||
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
|
||||
# HTML 5 charset
|
||||
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
|
||||
# HTML 4 Pragma directive
|
||||
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
|
||||
]
|
||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user