Split out the chardet library from the calibre package and upgrade it to version 2.0.1

This commit is contained in:
Kovid Goyal 2011-12-26 10:50:54 +05:30
parent b552d25fe0
commit 7233f134c6
38 changed files with 73 additions and 45 deletions

View File

@ -1,34 +1,15 @@
######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__version__ = "1.0"
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs
from chardet import detect
def detect(aBuf):
import calibre.ebooks.chardet.universaldetector as universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)
u.close()
return u.result
# Added by Kovid
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE),
@ -63,7 +44,8 @@ def force_encoding(raw, verbose, assume_utf8=False):
if chardet['confidence'] < 1 and assume_utf8:
encoding = 'utf-8'
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
print('WARNING: Encoding detection confidence %d%%'%(
chardet['confidence']*100))
if not encoding:
encoding = preferred_encoding
encoding = encoding.lower()
@ -113,7 +95,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
@return: (unicode, encoding used)
'''
if not raw:
return u'', None
return '', None
raw, encoding = detect_xml_encoding(raw, verbose=verbose,
assume_utf8=assume_utf8)
if not isinstance(raw, unicode):

View File

@ -188,7 +188,7 @@ except:
# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
import calibre.ebooks.chardet as chardet # Changed by Kovid
import chardet
if _debug:
import chardet.constants
chardet.constants._debug = 1

26
src/chardet/__init__.py Executable file
View File

@ -0,0 +1,26 @@
######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__version__ = "2.0.1"
def detect(aBuf):
import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)
u.close()
return u.result

View File

View File

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from constants import eStart
from constants import eStart, eError, eItsMe
class CodingStateMachine:
def __init__(self, sm):

View File

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
import constants, sys
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
return self.get_state()
return self.get_state()

View File

@ -180,7 +180,7 @@ eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
)
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'classFactor': 10,

View File

View File

View File

View File

View File

View File

View File

@ -14,19 +14,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
import constants, sys
from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model

View File

20
src/chardet/test.py Executable file
View File

@ -0,0 +1,20 @@
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
print f.ljust(60),
u.reset()
for line in file(f, 'rb'):
u.feed(line)
if u.done: break
u.close()
result = u.result
if result['encoding']:
print result['encoding'], 'with confidence', result['confidence']
else:
print '******** no result'
count += 1
print count, 'tests'

View File

@ -81,7 +81,7 @@ class UniversalDetector:
elif aBuf[:4] == '\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
elif aBuf[:4] == '\xFF\xFE':
elif aBuf[:2] == '\xFF\xFE':
# FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif aBuf[:2] == '\xFE\xFF':

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import constants
import constants, sys
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine