Split out the chardet library from the calibre package and upgrade it to version 2.0.1

2026-01-05 03:30:20 -05:00 · 2011-12-26 10:50:54 +05:30 · 2011-12-26 10:50:54 +05:30 · 7233f134c6
commit 7233f134c6
parent b552d25fe0
38 changed files with 73 additions and 45 deletions
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -1,34 +1,15 @@
-######################## BEGIN LICENSE BLOCK ########################
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
-# 02110-1301  USA
-######################### END LICENSE BLOCK #########################
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)

-
-__version__ = "1.0"
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'

 import re, codecs
+from chardet import detect

-def detect(aBuf):
-    import calibre.ebooks.chardet.universaldetector as universaldetector
-    u = universaldetector.UniversalDetector()
-    u.reset()
-    u.feed(aBuf)
-    u.close()
-    return u.result
-
-# Added by Kovid
 ENCODING_PATS = [
                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
                            re.IGNORECASE),
@@ -63,7 +44,8 @@ def force_encoding(raw, verbose, assume_utf8=False):
    if chardet['confidence'] < 1 and assume_utf8:
        encoding = 'utf-8'
    if chardet['confidence'] < 1 and verbose:
-        print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
+        print('WARNING: Encoding detection confidence %d%%'%(
+            chardet['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
@@ -113,7 +95,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
    @return: (unicode, encoding used)
    '''
    if not raw:
-        return u'', None
+        return '', None
    raw, encoding = detect_xml_encoding(raw, verbose=verbose,
            assume_utf8=assume_utf8)
    if not isinstance(raw, unicode):
--- a/src/calibre/web/feeds/feedparser.py
+++ b/src/calibre/web/feeds/feedparser.py
@@ -188,7 +188,7 @@ except:
 # chardet library auto-detects character encodings
 # Download from http://chardet.feedparser.org/
 try:
-    import calibre.ebooks.chardet as chardet # Changed by Kovid
+    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
--- a/src/chardet/__init__.py
+++ b/src/chardet/__init__.py
@@ -0,0 +1,26 @@
+######################## BEGIN LICENSE BLOCK ########################
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# 
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+# 
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+
+__version__ = "2.0.1"
+
+def detect(aBuf):
+    import universaldetector
+    u = universaldetector.UniversalDetector()
+    u.reset()
+    u.feed(aBuf)
+    u.close()
+    return u.result
--- a/src/calibre/ebooks/chardet/big5freq.py
+++ b/src/calibre/ebooks/chardet/big5freq.py
--- a/src/calibre/ebooks/chardet/big5prober.py
+++ b/src/calibre/ebooks/chardet/big5prober.py
--- a/src/calibre/ebooks/chardet/chardistribution.py
+++ b/src/calibre/ebooks/chardet/chardistribution.py
--- a/src/calibre/ebooks/chardet/charsetgroupprober.py
+++ b/src/calibre/ebooks/chardet/charsetgroupprober.py
--- a/src/calibre/ebooks/chardet/charsetprober.py
+++ b/src/calibre/ebooks/chardet/charsetprober.py
--- a/src/calibre/ebooks/chardet/codingstatemachine.py
+++ b/src/calibre/ebooks/chardet/codingstatemachine.py
@@ -13,19 +13,19 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-#
+# 
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
-#
+# 
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-from constants import eStart
+from constants import eStart, eError, eItsMe

 class CodingStateMachine:
    def __init__(self, sm):
--- a/src/calibre/ebooks/chardet/constants.py
+++ b/src/calibre/ebooks/chardet/constants.py
--- a/src/calibre/ebooks/chardet/escprober.py
+++ b/src/calibre/ebooks/chardet/escprober.py
@@ -13,19 +13,19 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-#
+# 
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
-#
+# 
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+import constants, sys
 from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
 from charsetprober import CharSetProber
 from codingstatemachine import CodingStateMachine
@@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = codingSM.get_coding_state_machine()
                    return self.get_state()
-
+                
        return self.get_state()
--- a/src/calibre/ebooks/chardet/escsm.py
+++ b/src/calibre/ebooks/chardet/escsm.py
@@ -180,7 +180,7 @@ eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
 eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 
 )

-ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0)
+ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

 ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
                    'classFactor': 10,
--- a/src/calibre/ebooks/chardet/eucjpprober.py
+++ b/src/calibre/ebooks/chardet/eucjpprober.py
--- a/src/calibre/ebooks/chardet/euckrfreq.py
+++ b/src/calibre/ebooks/chardet/euckrfreq.py
--- a/src/calibre/ebooks/chardet/euckrprober.py
+++ b/src/calibre/ebooks/chardet/euckrprober.py
--- a/src/calibre/ebooks/chardet/euctwfreq.py
+++ b/src/calibre/ebooks/chardet/euctwfreq.py
--- a/src/calibre/ebooks/chardet/euctwprober.py
+++ b/src/calibre/ebooks/chardet/euctwprober.py
--- a/src/calibre/ebooks/chardet/gb2312freq.py
+++ b/src/calibre/ebooks/chardet/gb2312freq.py
--- a/src/calibre/ebooks/chardet/gb2312prober.py
+++ b/src/calibre/ebooks/chardet/gb2312prober.py
--- a/src/calibre/ebooks/chardet/hebrewprober.py
+++ b/src/calibre/ebooks/chardet/hebrewprober.py
--- a/src/calibre/ebooks/chardet/jisfreq.py
+++ b/src/calibre/ebooks/chardet/jisfreq.py
--- a/src/calibre/ebooks/chardet/jpcntx.py
+++ b/src/calibre/ebooks/chardet/jpcntx.py
--- a/src/calibre/ebooks/chardet/langbulgarianmodel.py
+++ b/src/calibre/ebooks/chardet/langbulgarianmodel.py
--- a/src/calibre/ebooks/chardet/langcyrillicmodel.py
+++ b/src/calibre/ebooks/chardet/langcyrillicmodel.py
--- a/src/calibre/ebooks/chardet/langgreekmodel.py
+++ b/src/calibre/ebooks/chardet/langgreekmodel.py
--- a/src/calibre/ebooks/chardet/langhebrewmodel.py
+++ b/src/calibre/ebooks/chardet/langhebrewmodel.py
--- a/src/calibre/ebooks/chardet/langhungarianmodel.py
+++ b/src/calibre/ebooks/chardet/langhungarianmodel.py
--- a/src/calibre/ebooks/chardet/langthaimodel.py
+++ b/src/calibre/ebooks/chardet/langthaimodel.py
--- a/src/calibre/ebooks/chardet/latin1prober.py
+++ b/src/calibre/ebooks/chardet/latin1prober.py
--- a/src/calibre/ebooks/chardet/mbcharsetprober.py
+++ b/src/calibre/ebooks/chardet/mbcharsetprober.py
--- a/src/calibre/ebooks/chardet/mbcsgroupprober.py
+++ b/src/calibre/ebooks/chardet/mbcsgroupprober.py
--- a/src/calibre/ebooks/chardet/mbcssm.py
+++ b/src/calibre/ebooks/chardet/mbcssm.py
--- a/src/calibre/ebooks/chardet/sbcharsetprober.py
+++ b/src/calibre/ebooks/chardet/sbcharsetprober.py
--- a/src/calibre/ebooks/chardet/sbcsgroupprober.py
+++ b/src/calibre/ebooks/chardet/sbcsgroupprober.py
@@ -14,19 +14,19 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-#
+# 
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
-#
+# 
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+import constants, sys
 from charsetgroupprober import CharSetGroupProber
 from sbcharsetprober import SingleByteCharSetProber
 from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
--- a/src/calibre/ebooks/chardet/sjisprober.py
+++ b/src/calibre/ebooks/chardet/sjisprober.py
--- a/src/chardet/test.py
+++ b/src/chardet/test.py
@@ -0,0 +1,20 @@
+import sys, glob
+sys.path.insert(0, '..')
+from chardet.universaldetector import UniversalDetector
+
+count = 0
+u = UniversalDetector()
+for f in glob.glob(sys.argv[1]):
+    print f.ljust(60),
+    u.reset()
+    for line in file(f, 'rb'):
+        u.feed(line)
+        if u.done: break
+    u.close()
+    result = u.result
+    if result['encoding']:
+        print result['encoding'], 'with confidence', result['confidence']
+    else:
+        print '******** no result'
+    count += 1
+print count, 'tests'
--- a/src/calibre/ebooks/chardet/universaldetector.py
+++ b/src/calibre/ebooks/chardet/universaldetector.py
@@ -81,7 +81,7 @@ class UniversalDetector:
            elif aBuf[:4] == '\x00\x00\xFF\xFE':
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
-            elif aBuf[:4] == '\xFF\xFE':
+            elif aBuf[:2] == '\xFF\xFE':
                # FF FE  UTF-16, little endian BOM
                self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
            elif aBuf[:2] == '\xFE\xFF':
--- a/src/calibre/ebooks/chardet/utf8prober.py
+++ b/src/calibre/ebooks/chardet/utf8prober.py
@@ -13,19 +13,19 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-#
+# 
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
-#
+# 
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################

-import constants
+import constants, sys
 from constants import eStart, eError, eItsMe
 from charsetprober import CharSetProber
 from codingstatemachine import CodingStateMachine