When trying to detect the encoding of HTML, use no more than the first 10KB of the data so that detection is not too slow.

This commit is contained in:
Kovid Goyal 2011-02-16 14:34:09 -07:00
parent a904d5d192
commit fd2e3db07a

View File

@ -53,7 +53,7 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
def force_encoding(raw, verbose, assume_utf8=False):
from calibre.constants import preferred_encoding
try:
chardet = detect(raw)
chardet = detect(raw[:1024*10])
except:
chardet = {'encoding':preferred_encoding, 'confidence':0}
encoding = chardet['encoding']