diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet.py similarity index 73% rename from src/calibre/ebooks/chardet/__init__.py rename to src/calibre/ebooks/chardet.py index aa49341f01..598fc673a1 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet.py @@ -1,34 +1,15 @@ -######################## BEGIN LICENSE BLOCK ######################## -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA -# 02110-1301 USA -######################### END LICENSE BLOCK ######################### +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) - -__version__ = "1.0" +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' import re, codecs +from chardet import detect -def detect(aBuf): - import calibre.ebooks.chardet.universaldetector as universaldetector - u = universaldetector.UniversalDetector() - u.reset() - u.feed(aBuf) - u.close() - return u.result - -# Added by Kovid ENCODING_PATS = [ re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), @@ -63,7 +44,8 @@ def force_encoding(raw, verbose, assume_utf8=False): if chardet['confidence'] < 1 and assume_utf8: encoding = 'utf-8' if chardet['confidence'] < 1 and verbose: - print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) + print('WARNING: Encoding detection confidence %d%%'%( + chardet['confidence']*100)) if not encoding: encoding = preferred_encoding encoding = encoding.lower() @@ -113,7 +95,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, @return: (unicode, encoding used) ''' if not raw: - return u'', None + return '', None raw, encoding = detect_xml_encoding(raw, verbose=verbose, assume_utf8=assume_utf8) if not isinstance(raw, unicode): diff --git a/src/calibre/web/feeds/feedparser.py b/src/calibre/web/feeds/feedparser.py index 99c3e09666..3fc05988b0 100755 --- a/src/calibre/web/feeds/feedparser.py +++ b/src/calibre/web/feeds/feedparser.py @@ -188,7 +188,7 @@ except: # chardet library auto-detects character encodings # Download from http://chardet.feedparser.org/ try: - import calibre.ebooks.chardet as chardet # Changed by Kovid + import chardet if _debug: import chardet.constants chardet.constants._debug = 1 diff --git a/src/chardet/__init__.py b/src/chardet/__init__.py new file mode 100755 index 0000000000..953b399425 --- /dev/null +++ b/src/chardet/__init__.py @@ -0,0 +1,26 @@ +######################## BEGIN LICENSE BLOCK ######################## +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +__version__ = "2.0.1" + +def detect(aBuf): + import universaldetector + u = universaldetector.UniversalDetector() + u.reset() + u.feed(aBuf) + u.close() + return u.result diff --git a/src/calibre/ebooks/chardet/big5freq.py b/src/chardet/big5freq.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/big5freq.py rename to src/chardet/big5freq.py diff --git a/src/calibre/ebooks/chardet/big5prober.py b/src/chardet/big5prober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/big5prober.py rename to src/chardet/big5prober.py diff --git a/src/calibre/ebooks/chardet/chardistribution.py b/src/chardet/chardistribution.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/chardistribution.py rename to src/chardet/chardistribution.py diff --git a/src/calibre/ebooks/chardet/charsetgroupprober.py b/src/chardet/charsetgroupprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/charsetgroupprober.py rename to src/chardet/charsetgroupprober.py diff --git a/src/calibre/ebooks/chardet/charsetprober.py b/src/chardet/charsetprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/charsetprober.py rename to src/chardet/charsetprober.py diff --git a/src/calibre/ebooks/chardet/codingstatemachine.py b/src/chardet/codingstatemachine.py old mode 100644 new mode 100755 similarity index 97% rename from src/calibre/ebooks/chardet/codingstatemachine.py rename to src/chardet/codingstatemachine.py index 5e759007ea..452d3b0a06 --- a/src/calibre/ebooks/chardet/codingstatemachine.py +++ b/src/chardet/codingstatemachine.py @@ -13,19 +13,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart +from constants import eStart, eError, eItsMe class CodingStateMachine: def __init__(self, sm): diff --git a/src/calibre/ebooks/chardet/constants.py b/src/chardet/constants.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/constants.py rename to src/chardet/constants.py diff --git a/src/calibre/ebooks/chardet/escprober.py b/src/chardet/escprober.py old mode 100644 new mode 100755 similarity index 98% rename from src/calibre/ebooks/chardet/escprober.py rename to src/chardet/escprober.py index 5d98b2aad6..572ed7be37 --- a/src/calibre/ebooks/chardet/escprober.py +++ b/src/chardet/escprober.py @@ -13,19 +13,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +import constants, sys from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel from charsetprober import CharSetProber from codingstatemachine import CodingStateMachine @@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber): self._mState = constants.eFoundIt self._mDetectedCharset = codingSM.get_coding_state_machine() return self.get_state() - + return self.get_state() diff --git a/src/calibre/ebooks/chardet/escsm.py b/src/chardet/escsm.py old mode 100644 new mode 100755 similarity index 99% rename from src/calibre/ebooks/chardet/escsm.py rename to src/chardet/escsm.py index 30283593e1..9fa22952e1 --- a/src/calibre/ebooks/chardet/escsm.py +++ b/src/chardet/escsm.py @@ -180,7 +180,7 @@ eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 ) -ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0) +ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022JPSMModel = {'classTable': ISO2022JP_cls, 'classFactor': 10, diff --git a/src/calibre/ebooks/chardet/eucjpprober.py b/src/chardet/eucjpprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/eucjpprober.py rename to src/chardet/eucjpprober.py diff --git a/src/calibre/ebooks/chardet/euckrfreq.py b/src/chardet/euckrfreq.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/euckrfreq.py rename to src/chardet/euckrfreq.py diff --git a/src/calibre/ebooks/chardet/euckrprober.py b/src/chardet/euckrprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/euckrprober.py rename to src/chardet/euckrprober.py diff --git a/src/calibre/ebooks/chardet/euctwfreq.py b/src/chardet/euctwfreq.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/euctwfreq.py rename to src/chardet/euctwfreq.py diff --git a/src/calibre/ebooks/chardet/euctwprober.py b/src/chardet/euctwprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/euctwprober.py rename to src/chardet/euctwprober.py diff --git a/src/calibre/ebooks/chardet/gb2312freq.py b/src/chardet/gb2312freq.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/gb2312freq.py rename to src/chardet/gb2312freq.py diff --git a/src/calibre/ebooks/chardet/gb2312prober.py b/src/chardet/gb2312prober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/gb2312prober.py rename to src/chardet/gb2312prober.py diff --git a/src/calibre/ebooks/chardet/hebrewprober.py b/src/chardet/hebrewprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/hebrewprober.py rename to src/chardet/hebrewprober.py diff --git a/src/calibre/ebooks/chardet/jisfreq.py b/src/chardet/jisfreq.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/jisfreq.py rename to src/chardet/jisfreq.py diff --git a/src/calibre/ebooks/chardet/jpcntx.py b/src/chardet/jpcntx.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/jpcntx.py rename to src/chardet/jpcntx.py diff --git a/src/calibre/ebooks/chardet/langbulgarianmodel.py b/src/chardet/langbulgarianmodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langbulgarianmodel.py rename to src/chardet/langbulgarianmodel.py diff --git a/src/calibre/ebooks/chardet/langcyrillicmodel.py b/src/chardet/langcyrillicmodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langcyrillicmodel.py rename to src/chardet/langcyrillicmodel.py diff --git a/src/calibre/ebooks/chardet/langgreekmodel.py b/src/chardet/langgreekmodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langgreekmodel.py rename to src/chardet/langgreekmodel.py diff --git a/src/calibre/ebooks/chardet/langhebrewmodel.py b/src/chardet/langhebrewmodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langhebrewmodel.py rename to src/chardet/langhebrewmodel.py diff --git a/src/calibre/ebooks/chardet/langhungarianmodel.py b/src/chardet/langhungarianmodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langhungarianmodel.py rename to src/chardet/langhungarianmodel.py diff --git a/src/calibre/ebooks/chardet/langthaimodel.py b/src/chardet/langthaimodel.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/langthaimodel.py rename to src/chardet/langthaimodel.py diff --git a/src/calibre/ebooks/chardet/latin1prober.py b/src/chardet/latin1prober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/latin1prober.py rename to src/chardet/latin1prober.py diff --git a/src/calibre/ebooks/chardet/mbcharsetprober.py b/src/chardet/mbcharsetprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/mbcharsetprober.py rename to src/chardet/mbcharsetprober.py diff --git a/src/calibre/ebooks/chardet/mbcsgroupprober.py b/src/chardet/mbcsgroupprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/mbcsgroupprober.py rename to src/chardet/mbcsgroupprober.py diff --git a/src/calibre/ebooks/chardet/mbcssm.py b/src/chardet/mbcssm.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/mbcssm.py rename to src/chardet/mbcssm.py diff --git a/src/calibre/ebooks/chardet/sbcharsetprober.py b/src/chardet/sbcharsetprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/sbcharsetprober.py rename to src/chardet/sbcharsetprober.py diff --git a/src/calibre/ebooks/chardet/sbcsgroupprober.py b/src/chardet/sbcsgroupprober.py old mode 100644 new mode 100755 similarity index 99% rename from src/calibre/ebooks/chardet/sbcsgroupprober.py rename to src/chardet/sbcsgroupprober.py index 6269d4c1d8..d19160c86c --- a/src/calibre/ebooks/chardet/sbcsgroupprober.py +++ b/src/chardet/sbcsgroupprober.py @@ -14,19 +14,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +import constants, sys from charsetgroupprober import CharSetGroupProber from sbcharsetprober import SingleByteCharSetProber from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model diff --git a/src/calibre/ebooks/chardet/sjisprober.py b/src/chardet/sjisprober.py old mode 100644 new mode 100755 similarity index 100% rename from src/calibre/ebooks/chardet/sjisprober.py rename to src/chardet/sjisprober.py diff --git a/src/chardet/test.py b/src/chardet/test.py new file mode 100755 index 0000000000..2ebf3a4dcd --- /dev/null +++ b/src/chardet/test.py @@ -0,0 +1,20 @@ +import sys, glob +sys.path.insert(0, '..') +from chardet.universaldetector import UniversalDetector + +count = 0 +u = UniversalDetector() +for f in glob.glob(sys.argv[1]): + print f.ljust(60), + u.reset() + for line in file(f, 'rb'): + u.feed(line) + if u.done: break + u.close() + result = u.result + if result['encoding']: + print result['encoding'], 'with confidence', result['confidence'] + else: + print '******** no result' + count += 1 +print count, 'tests' diff --git a/src/calibre/ebooks/chardet/universaldetector.py b/src/chardet/universaldetector.py old mode 100644 new mode 100755 similarity index 99% rename from src/calibre/ebooks/chardet/universaldetector.py rename to src/chardet/universaldetector.py index 642c2a6e09..809df2276f --- a/src/calibre/ebooks/chardet/universaldetector.py +++ b/src/chardet/universaldetector.py @@ -81,7 +81,7 @@ class UniversalDetector: elif aBuf[:4] == '\x00\x00\xFF\xFE': # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} - elif aBuf[:4] == '\xFF\xFE': + elif aBuf[:2] == '\xFF\xFE': # FF FE UTF-16, little endian BOM self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} elif aBuf[:2] == '\xFE\xFF': diff --git a/src/calibre/ebooks/chardet/utf8prober.py b/src/chardet/utf8prober.py old mode 100644 new mode 100755 similarity index 98% rename from src/calibre/ebooks/chardet/utf8prober.py rename to src/chardet/utf8prober.py index 1a1618ecc2..c1792bb377 --- a/src/calibre/ebooks/chardet/utf8prober.py +++ b/src/chardet/utf8prober.py @@ -13,19 +13,19 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +import constants, sys from constants import eStart, eError, eItsMe from charsetprober import CharSetProber from codingstatemachine import CodingStateMachine