Split out the chardet library from the calibre package and upgrade it to version 2.0.1

This commit is contained in:
Kovid Goyal 2011-12-26 10:50:54 +05:30
parent b552d25fe0
commit 7233f134c6
38 changed files with 73 additions and 45 deletions

View File

@ -1,34 +1,15 @@
######################## BEGIN LICENSE BLOCK ######################## #!/usr/bin/env python
# This library is free software; you can redistribute it and/or # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# modify it under the terms of the GNU Lesser General Public from __future__ import (unicode_literals, division, absolute_import,
# License as published by the Free Software Foundation; either print_function)
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__license__ = 'GPL v3'
__version__ = "1.0" __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs import re, codecs
from chardet import detect
def detect(aBuf):
import calibre.ebooks.chardet.universaldetector as universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)
u.close()
return u.result
# Added by Kovid
ENCODING_PATS = [ ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE), re.IGNORECASE),
@ -63,7 +44,8 @@ def force_encoding(raw, verbose, assume_utf8=False):
if chardet['confidence'] < 1 and assume_utf8: if chardet['confidence'] < 1 and assume_utf8:
encoding = 'utf-8' encoding = 'utf-8'
if chardet['confidence'] < 1 and verbose: if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) print('WARNING: Encoding detection confidence %d%%'%(
chardet['confidence']*100))
if not encoding: if not encoding:
encoding = preferred_encoding encoding = preferred_encoding
encoding = encoding.lower() encoding = encoding.lower()
@ -113,7 +95,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
@return: (unicode, encoding used) @return: (unicode, encoding used)
''' '''
if not raw: if not raw:
return u'', None return '', None
raw, encoding = detect_xml_encoding(raw, verbose=verbose, raw, encoding = detect_xml_encoding(raw, verbose=verbose,
assume_utf8=assume_utf8) assume_utf8=assume_utf8)
if not isinstance(raw, unicode): if not isinstance(raw, unicode):

View File

@ -188,7 +188,7 @@ except:
# chardet library auto-detects character encodings # chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/ # Download from http://chardet.feedparser.org/
try: try:
import calibre.ebooks.chardet as chardet # Changed by Kovid import chardet
if _debug: if _debug:
import chardet.constants import chardet.constants
chardet.constants._debug = 1 chardet.constants._debug = 1

26
src/chardet/__init__.py Executable file
View File

@ -0,0 +1,26 @@
######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__version__ = "2.0.1"
def detect(aBuf):
    """Guess the character encoding of ``aBuf`` in a single pass.

    A new ``UniversalDetector`` is created, reset, fed the whole buffer
    once and then closed.  The detector's ``result`` attribute is returned
    unchanged; elsewhere in this package it is a dict carrying
    ``'encoding'`` and ``'confidence'`` entries.
    """
    # Deferred to call time, as before, so importing the package itself
    # stays cheap.
    from universaldetector import UniversalDetector

    detector = UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result

View File

View File

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from constants import eStart from constants import eStart, eError, eItsMe
class CodingStateMachine: class CodingStateMachine:
def __init__(self, sm): def __init__(self, sm):

View File

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants import constants, sys
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine from codingstatemachine import CodingStateMachine
@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
self._mState = constants.eFoundIt self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine() self._mDetectedCharset = codingSM.get_coding_state_machine()
return self.get_state() return self.get_state()
return self.get_state() return self.get_state()

View File

@ -180,7 +180,7 @@ eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
) )
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0) ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JPSMModel = {'classTable': ISO2022JP_cls, ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'classFactor': 10, 'classFactor': 10,

View File

View File

View File

View File

View File

View File

View File

@ -14,19 +14,19 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants import constants, sys
from charsetgroupprober import CharSetGroupProber from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model

View File

20
src/chardet/test.py Executable file
View File

@ -0,0 +1,20 @@
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector
count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
print f.ljust(60),
u.reset()
for line in file(f, 'rb'):
u.feed(line)
if u.done: break
u.close()
result = u.result
if result['encoding']:
print result['encoding'], 'with confidence', result['confidence']
else:
print '******** no result'
count += 1
print count, 'tests'

View File

@ -81,7 +81,7 @@ class UniversalDetector:
elif aBuf[:4] == '\x00\x00\xFF\xFE': elif aBuf[:4] == '\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143) # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
elif aBuf[:4] == '\xFF\xFE': elif aBuf[:2] == '\xFF\xFE':
# FF FE UTF-16, little endian BOM # FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
elif aBuf[:2] == '\xFE\xFF': elif aBuf[:2] == '\xFE\xFF':

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public # modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either # License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version. # version 2.1 of the License, or (at your option) any later version.
# #
# This library is distributed in the hope that it will be useful, # This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details. # Lesser General Public License for more details.
# #
# You should have received a copy of the GNU Lesser General Public # You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software # License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
import constants import constants, sys
from constants import eStart, eError, eItsMe from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine from codingstatemachine import CodingStateMachine