diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 158b15fe49..f634f0a77e 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,16 +10,13 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 
 ENCODING_PATS = [
-                 # XML declaration
-                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
-                            re.IGNORECASE),
-                 # HTML 4 Pragma directive
-                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
-                            re.IGNORECASE),
-                 # HTML 5 charset
-                 re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
-                            re.IGNORECASE),
-                 ]
+    # XML declaration
+    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
+    # HTML 4 Pragma directive
+    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+    # HTML 5 charset
+    re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 
 def strip_encoding_declarations(raw):
@@ -35,8 +32,8 @@ def substitute_entites(raw):
     from calibre import xml_entity_to_unicode
     return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
 
-_CHARSET_ALIASES = { "macintosh" : "mac-roman",
-                        "x-sjis" : "shift-jis" }
+_CHARSET_ALIASES = {"macintosh" : "mac-roman",
+                        "x-sjis" : "shift-jis"}
 
 def detect(*args, **kwargs):
     from chardet import detect
@@ -58,8 +55,7 @@ def force_encoding(raw, verbose, assume_utf8=False):
     if not encoding:
         encoding = preferred_encoding
     encoding = encoding.lower()
-    if _CHARSET_ALIASES.has_key(encoding):
-        encoding = _CHARSET_ALIASES[encoding]
+    encoding = _CHARSET_ALIASES.get(encoding, encoding)
     if encoding == 'ascii':
         encoding = 'utf-8'
     return encoding