diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 1c61d3e739..551a1f4c04 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -251,16 +251,6 @@ def contractions(col=None): ################################################################################ -if not hasattr(_icu, 'change_case'): - print ('You are running from source with an outdated calibre binary install. You' - ' should update the main calibre binary to at least version 1.28.') - # Dont creak calibre for people running from source until the - # next binary is available witht he update icu module - from calibre.utils.icu_old import * # noqa - - def primary_contains(pat, src): - return primary_find(pat, src)[0] != -1 - if __name__ == '__main__': from calibre.utils.icu_test import run run(verbosity=4) diff --git a/src/calibre/utils/icu_old.py b/src/calibre/utils/icu_old.py deleted file mode 100644 index 39256f6fd6..0000000000 --- a/src/calibre/utils/icu_old.py +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env python -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - -__license__ = 'GPL v3' -__copyright__ = '2010, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - -# Setup code {{{ -import sys -from functools import partial - -from calibre.constants import plugins -from calibre.utils.config_base import tweaks - -_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None -_locale = None - -_none = u'' -_none2 = b'' - -def get_locale(): - global _locale - if _locale is None: - from calibre.utils.localization import get_lang - if tweaks['locale_for_sorting']: - _locale = tweaks['locale_for_sorting'] - else: - _locale = get_lang() - return _locale - -def load_icu(): - global _icu - if _icu is None: - _icu = plugins['icu'][0] - if _icu is None: - print 'Loading ICU failed with: ', plugins['icu'][1] - else: - if not getattr(_icu, 'ok', False): - print 'icu not ok' - _icu = None - return _icu - -def load_collator(): - 'The default collator for most locales takes both case and accented letters into account' - global _collator - if _collator is None: - icu = load_icu() - if icu is not None: - _collator = icu.Collator(get_locale()) - return _collator - -def primary_collator(): - 'Ignores case differences and accented characters' - global _primary_collator - if _primary_collator is None: - _primary_collator = _collator.clone() - _primary_collator.strength = _icu.UCOL_PRIMARY - return _primary_collator - -def sort_collator(): - 'Ignores case differences and recognizes numbers in strings' - global _sort_collator - if _sort_collator is None: - _sort_collator = _collator.clone() - _sort_collator.strength = _icu.UCOL_SECONDARY - if tweaks['numeric_collation']: - try: - _sort_collator.numeric = True - except AttributeError: - pass - return _sort_collator - -def py_sort_key(obj): - if not obj: - return _none - return obj.lower() - -def icu_sort_key(collator, obj): - if not obj: - return _none2 - try: - try: - return _sort_collator.sort_key(obj) - except AttributeError: - return sort_collator().sort_key(obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return _sort_collator.sort_key(obj) - -def numeric_collator(): - global _numeric_collator - _numeric_collator = _collator.clone() - _numeric_collator.strength = _icu.UCOL_SECONDARY - _numeric_collator.numeric = True - return _numeric_collator - -def numeric_sort_key(obj): - 'Uses natural sorting for numbers inside strings so something2 will sort before something10' - if not obj: - return _none2 - try: - try: - return _numeric_collator.sort_key(obj) - except AttributeError: - return numeric_collator().sort_key(obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return _numeric_collator.sort_key(obj) - -def icu_change_case(upper, locale, obj): - func = _icu.upper if upper else _icu.lower - try: - return func(locale, obj) - except TypeError: - if isinstance(obj, unicode): - obj = obj.replace(u'\0', u'') - else: - obj = obj.replace(b'\0', b'') - return func(locale, obj) - -def py_find(pattern, source): - pos = source.find(pattern) - if pos > -1: - return pos, len(pattern) - return -1, -1 - -def character_name(string): - try: - try: - return _icu.character_name(unicode(string)) or None - except AttributeError: - import unicodedata - return unicodedata.name(unicode(string)[0], None) - except (TypeError, ValueError, KeyError): - pass - -def character_name_from_code(code): - try: - try: - return _icu.character_name_from_code(code) or '' - except AttributeError: - import unicodedata - return unicodedata.name(py_safe_chr(code), '') - except (TypeError, ValueError, KeyError): - return '' - -if sys.maxunicode >= 0x10ffff: - try: - py_safe_chr = unichr - except NameError: - py_safe_chr = chr -else: - def py_safe_chr(i): - # Narrow builds of python cannot represent code point > 0xffff as a - # single character, so we need our own implementation of unichr - # that returns them as a surrogate pair - return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape') - -def safe_chr(code): - try: - return _icu.chr(code) - except AttributeError: - return py_safe_chr(code) - -def normalize(text, mode='NFC'): - # This is very slightly slower than using unicodedata.normalize, so stick with - # that unless you have very good reasons not too. Also, it's speed - # decreases on wide python builds, where conversion to/from ICU's string - # representation is slower. - try: - return _icu.normalize(_nmodes[mode], unicode(text)) - except (AttributeError, KeyError): - import unicodedata - return unicodedata.normalize(mode, unicode(text)) - -def icu_find(collator, pattern, source): - try: - return collator.find(pattern, source) - except TypeError: - return collator.find(unicode(pattern), unicode(source)) - -def icu_startswith(collator, a, b): - try: - return collator.startswith(a, b) - except TypeError: - return collator.startswith(unicode(a), unicode(b)) - -def py_case_sensitive_sort_key(obj): - if not obj: - return _none - return obj - -def icu_case_sensitive_sort_key(collator, obj): - if not obj: - return _none2 - return collator.sort_key(obj) - -def icu_strcmp(collator, a, b): - return collator.strcmp(lower(a), lower(b)) - -def py_strcmp(a, b): - return cmp(a.lower(), b.lower()) - -def icu_case_sensitive_strcmp(collator, a, b): - return collator.strcmp(a, b) - -def icu_capitalize(s): - s = lower(s) - return s.replace(s[0], upper(s[0]), 1) if s else s - -_cmap = {} -def icu_contractions(collator): - global _cmap - ans = _cmap.get(collator, None) - if ans is None: - ans = collator.contractions() - ans = frozenset(filter(None, ans)) if ans else {} - _cmap[collator] = ans - return ans - -def icu_collation_order(collator, a): - try: - return collator.collation_order(a) - except TypeError: - return collator.collation_order(unicode(a)) - -load_icu() -load_collator() -_icu_not_ok = _icu is None or _collator is None -icu_unicode_version = getattr(_icu, 'unicode_version', None) -_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')} - -try: - senc = sys.getdefaultencoding() - if not senc or senc.lower() == 'ascii': - _icu.set_default_encoding('utf-8') - del senc -except: - pass - -try: - fenc = sys.getfilesystemencoding() - if not fenc or fenc.lower() == 'ascii': - _icu.set_filesystem_encoding('utf-8') - del fenc -except: - pass - - -# }}} - -################# The string functions ######################################## - -sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator) - -strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator) - -case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \ - partial(icu_case_sensitive_sort_key, _collator) - -case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp - -upper = (lambda s: s.upper()) if _icu_not_ok else \ - partial(icu_change_case, True, get_locale()) - -lower = (lambda s: s.lower()) if _icu_not_ok else \ - partial(icu_change_case, False, get_locale()) - -title_case = (lambda s: s.title()) if _icu_not_ok else \ - partial(_icu.title, get_locale()) - -capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \ - (lambda s: icu_capitalize(s)) - -find = (py_find if _icu_not_ok else partial(icu_find, _collator)) - -contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions, - _collator))) - -def primary_strcmp(a, b): - 'strcmp that ignores case and accents on letters' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return py_strcmp(ascii_text(a), ascii_text(b)) - try: - return _primary_collator.strcmp(a, b) - except AttributeError: - return primary_collator().strcmp(a, b) - -def primary_find(pat, src): - 'find that ignores case and accents on letters' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return py_find(ascii_text(pat), ascii_text(src)) - return primary_icu_find(pat, src) - -def primary_icu_find(pat, src): - try: - return icu_find(_primary_collator, pat, src) - except AttributeError: - return icu_find(primary_collator(), pat, src) - -def primary_sort_key(val): - 'A sort key that ignores case and diacritics' - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return ascii_text(val).lower() - try: - return _primary_collator.sort_key(val) - except AttributeError: - return primary_collator().sort_key(val) - -def primary_startswith(a, b): - if _icu_not_ok: - from calibre.utils.filenames import ascii_text - return ascii_text(a).lower().startswith(ascii_text(b).lower()) - try: - return icu_startswith(_primary_collator, a, b) - except AttributeError: - return icu_startswith(primary_collator(), a, b) - -def collation_order(a): - if _icu_not_ok: - return (ord(a[0]), 1) if a else (0, 0) - try: - return icu_collation_order(_sort_collator, a) - except AttributeError: - return icu_collation_order(sort_collator(), a) - -################################################################################ - -def test(): # {{{ - from calibre import prints - # Data {{{ - german = ''' - Sonntag -Montag -Dienstag -Januar -Februar -März -Fuße -Fluße -Flusse -flusse -fluße -flüße -flüsse -''' - german_good = ''' - Dienstag -Februar -flusse -Flusse -fluße -Fluße -flüsse -flüße -Fuße -Januar -März -Montag -Sonntag''' - french = ''' -dimanche -lundi -mardi -janvier -février -mars -déjà -Meme -deja -même -dejà -bpef -bœg -Boef -Mémé -bœf -boef -bnef -pêche -pèché -pêché -pêche -pêché''' - french_good = ''' - bnef - boef - Boef - bœf - bœg - bpef - deja - dejà - déjà - dimanche - février - janvier - lundi - mardi - mars - Meme - Mémé - même - pèché - pêche - pêche - pêché - pêché''' - # }}} - - def create(l): - l = l.decode('utf-8').splitlines() - return [x.strip() for x in l if x.strip()] - - def test_strcmp(entries): - for x in entries: - for y in entries: - if strcmp(x, y) != cmp(sort_key(x), sort_key(y)): - print 'strcmp failed for %r, %r'%(x, y) - - german = create(german) - c = _icu.Collator('de') - c.numeric = True - gs = list(sorted(german, key=c.sort_key)) - if gs != create(german_good): - print 'German sorting failed' - return - print - french = create(french) - c = _icu.Collator('fr') - c.numeric = True - fs = list(sorted(french, key=c.sort_key)) - if fs != create(french_good): - print 'French sorting failed (note that French fails with icu < 4.6)' - return - test_strcmp(german + french) - - print '\nTesting case transforms in current locale' - from calibre.utils.titlecase import titlecase - for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'): - print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8') - print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8') - print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8') - print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8') - print - - print '\nTesting primary collation' - for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', - u'Štepánek':u'ŠtepaneK'}.iteritems(): - if primary_strcmp(k, v) != 0: - prints('primary_strcmp() failed with %s != %s'%(k, v)) - return - if primary_find(v, u' '+k)[0] != 1: - prints('primary_find() failed with %s not in %s'%(v, k)) - return - - n = character_name(safe_chr(0x1f431)) - if n != u'CAT FACE': - raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE') - - global _primary_collator - orig = _primary_collator - _primary_collator = _icu.Collator('es') - if primary_strcmp(u'peña', u'pena') == 0: - print 'Primary collation in Spanish locale failed' - return - _primary_collator = orig - - print '\nTesting contractions' - c = _icu.Collator('cs') - if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch', - u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH', - u'S\u030c', u'R\u030c']): - print 'Contractions for the Czech language failed' - return - - print '\nTesting startswith' - p = primary_startswith - if (not p('asd', 'asd') or not p('asd', 'A') or - not p('x', '')): - print 'startswith() failed' - return - - print '\nTesting collation_order()' - for group in [ - ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'), - ('calibre', 'Charon', 'Collins'), - ('01', '1'), - ('1', '11', '13'), - ]: - last = None - for x in group: - val = icu_collation_order(sort_collator(), x) - if val[1] != 1: - prints('collation_order() returned incorrect length for', x) - if last is None: - last = val - else: - if val != last: - prints('collation_order() returned incorrect value for', x) - last = val - -# }}} - -def test_roundtrip(): - for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): - rp = _icu.roundtrip(r) - if rp != r: - raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp)) - -def test_normalize_performance(): - import os - if not os.path.exists('t.txt'): - return - raw = open('t.txt', 'rb').read().decode('utf-8') - print (len(raw)) - import time, unicodedata - st = time.time() - count = 100 - for i in xrange(count): - normalize(raw) - print ('ICU time:', time.time() - st) - st = time.time() - for i in xrange(count): - unicodedata.normalize('NFC', unicode(raw)) - print ('py time:', time.time() - st) - -if __name__ == '__main__': - test_roundtrip() - test_normalize_performance() - test() -