diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 158b15fe49..f634f0a77e 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,16 +10,13 @@ __docformat__ = 'restructuredtext en'
import re, codecs
ENCODING_PATS = [
- # XML declaration
- re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
- re.IGNORECASE),
- # HTML 4 Pragma directive
- re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
- re.IGNORECASE),
- # HTML 5 charset
- re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
- re.IGNORECASE),
- ]
+ # XML declaration
+ re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
+ # HTML 4 Pragma directive
+ re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+ # HTML 5 charset
+ re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+]
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw):
@@ -35,8 +32,8 @@ def substitute_entites(raw):
from calibre import xml_entity_to_unicode
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
-_CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+_CHARSET_ALIASES = {"macintosh" : "mac-roman",
+ "x-sjis" : "shift-jis"}
def detect(*args, **kwargs):
from chardet import detect
@@ -58,8 +55,7 @@ def force_encoding(raw, verbose, assume_utf8=False):
if not encoding:
encoding = preferred_encoding
encoding = encoding.lower()
- if _CHARSET_ALIASES.has_key(encoding):
- encoding = _CHARSET_ALIASES[encoding]
+ encoding = _CHARSET_ALIASES.get(encoding, encoding)
if encoding == 'ascii':
encoding = 'utf-8'
return encoding