diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index f6973c72a6..6f335a5434 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -247,6 +247,16 @@ def contractions(col=None):
 
 ################################################################################
 
+if not hasattr(_icu, 'change_case'):
+    print ('You are running from source with an outdated calibre binary install. You'
+           ' should update the main calibre binary to at least version 1.28.')
+    # Don't break calibre for people running from source until the
+    # next binary is available with the updated icu module
+    from calibre.utils.icu_old import *  # noqa
+
+    def primary_contains(pat, src):
+        return primary_find(pat, src)[0] != -1
+
 if __name__ == '__main__':
     from calibre.utils.icu_test import run
     run(verbosity=4)
diff --git a/src/calibre/utils/icu_old.py b/src/calibre/utils/icu_old.py
new file mode 100644
index 0000000000..39256f6fd6
--- /dev/null
+++ b/src/calibre/utils/icu_old.py
@@ -0,0 +1,575 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+# Legacy implementation that calibre.utils.icu star-imports when the binary
+# _icu module has no change_case attribute (an outdated binary install).
+
+# Setup code {{{
+import sys
+from functools import partial
+
+from calibre.constants import plugins
+from calibre.utils.config_base import tweaks
+
+_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None
+_locale = None
+
+_none = u''
+_none2 = b''
+
+def get_locale():
+    'Locale used for collation: the locale_for_sorting tweak if set, else get_lang(). Cached in _locale'
+    global _locale
+    if _locale is None:
+        from calibre.utils.localization import get_lang
+        if tweaks['locale_for_sorting']:
+            _locale = tweaks['locale_for_sorting']
+        else:
+            _locale = get_lang()
+    return _locale
+
+def load_icu():
+    'Load the icu plugin once; prints the reason and returns None if it failed to load or is not ok'
+    global _icu
+    if _icu is None:
+        _icu = plugins['icu'][0]
+        if _icu is None:
+            print 'Loading ICU failed with: ', plugins['icu'][1]
+        else:
+            if not getattr(_icu, 'ok', False):
+                print 'icu not ok'
+                _icu = None
+    return _icu
+
+def load_collator():
+    'The default collator for most locales takes both case and accented letters into account'
+    global _collator
+    if _collator is None:
+        icu = load_icu()
+        if icu is not None:
+            _collator = icu.Collator(get_locale())
+    return _collator
+
+def primary_collator():
+    'Ignores case differences and accented characters'
+    global _primary_collator
+    if _primary_collator is None:
+        _primary_collator = _collator.clone()
+        _primary_collator.strength = _icu.UCOL_PRIMARY
+    return _primary_collator
+
+def sort_collator():
+    'Ignores case differences and recognizes numbers in strings'
+    global _sort_collator
+    if _sort_collator is None:
+        _sort_collator = _collator.clone()
+        _sort_collator.strength = _icu.UCOL_SECONDARY
+        if tweaks['numeric_collation']:
+            try:
+                _sort_collator.numeric = True
+            except AttributeError:
+                pass
+    return _sort_collator
+
+def py_sort_key(obj):
+    'Pure python fallback sort key: the lower-cased obj, or an empty unicode string for falsy input'
+    if not obj:
+        return _none
+    return obj.lower()
+
+def icu_sort_key(collator, obj):
+    'Sort key from sort_collator() (the collator argument is not used); retried with NULs removed on TypeError'
+    if not obj:
+        return _none2
+    try:
+        try:
+            return _sort_collator.sort_key(obj)
+        except AttributeError:
+            return sort_collator().sort_key(obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return _sort_collator.sort_key(obj)
+
+def numeric_collator():
+    'Build and store a secondary strength clone of the default collator with numeric ordering turned on'
+    global _numeric_collator
+    _numeric_collator = _collator.clone()
+    _numeric_collator.strength = _icu.UCOL_SECONDARY
+    _numeric_collator.numeric = True
+    return _numeric_collator
+
+def numeric_sort_key(obj):
+    'Uses natural sorting for numbers inside strings so something2 will sort before something10'
+    if not obj:
+        return _none2
+    try:
+        try:
+            return _numeric_collator.sort_key(obj)
+        except AttributeError:
+            return numeric_collator().sort_key(obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return _numeric_collator.sort_key(obj)
+
+def icu_change_case(upper, locale, obj):
+    'Upper case (upper is true) or lower case obj in locale; retried with NULs removed on TypeError'
+    func = _icu.upper if upper else _icu.lower
+    try:
+        return func(locale, obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return func(locale, obj)
+
+def py_find(pattern, source):
+    'Return (position, len(pattern)) for the first occurrence of pattern in source, or (-1, -1)'
+    pos = source.find(pattern)
+    if pos > -1:
+        return pos, len(pattern)
+    return -1, -1
+
+def character_name(string):
+    'Character name via ICU, or via unicodedata (first character only) without it; None if unavailable'
+    try:
+        try:
+            return _icu.character_name(unicode(string)) or None
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(unicode(string)[0], None)
+    except (TypeError, ValueError, KeyError):
+        pass
+
+def character_name_from_code(code):
+    'Character name for the code point code, or an empty string if unavailable'
+    try:
+        try:
+            return _icu.character_name_from_code(code) or ''
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(py_safe_chr(code), '')
+    except (TypeError, ValueError, KeyError):
+        return ''
+
+if sys.maxunicode >= 0x10ffff:
+    try:
+        py_safe_chr = unichr
+    except NameError:
+        py_safe_chr = chr
+else:
+    def py_safe_chr(i):
+        # Narrow builds of python cannot represent code point > 0xffff as a
+        # single character, so we need our own implementation of unichr
+        # that returns them as a surrogate pair
+        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
+
+def safe_chr(code):
+    'Character for the code point code, from _icu.chr() when available and py_safe_chr() otherwise'
+    try:
+        return _icu.chr(code)
+    except AttributeError:
+        return py_safe_chr(code)
+
+def normalize(text, mode='NFC'):
+    'Normalize text to the form mode with ICU; uses unicodedata.normalize on AttributeError or KeyError'
+    # This is very slightly slower than using unicodedata.normalize, so stick with
+    # that unless you have very good reasons not to. Also, its speed
+    # decreases on wide python builds, where conversion to/from ICU's string
+    # representation is slower.
+    try:
+        return _icu.normalize(_nmodes[mode], unicode(text))
+    except (AttributeError, KeyError):
+        import unicodedata
+        return unicodedata.normalize(mode, unicode(text))
+
+def icu_find(collator, pattern, source):
+    'collator.find(pattern, source), retried with both arguments converted to unicode on TypeError'
+    try:
+        return collator.find(pattern, source)
+    except TypeError:
+        return collator.find(unicode(pattern), unicode(source))
+
+def icu_startswith(collator, a, b):
+    'collator.startswith(a, b), retried with both arguments converted to unicode on TypeError'
+    try:
+        return collator.startswith(a, b)
+    except TypeError:
+        return collator.startswith(unicode(a), unicode(b))
+
+def py_case_sensitive_sort_key(obj):
+    'Pure python fallback: obj itself is the key, or an empty unicode string for falsy input'
+    if not obj:
+        return _none
+    return obj
+
+def icu_case_sensitive_sort_key(collator, obj):
+    'collator.sort_key(obj), or an empty byte string for falsy input'
+    if not obj:
+        return _none2
+    return collator.sort_key(obj)
+
+def icu_strcmp(collator, a, b):
+    'Compare the lower-cased forms of a and b with collator'
+    return collator.strcmp(lower(a), lower(b))
+
+def py_strcmp(a, b):
+    'Pure python fallback: cmp() of the lower-cased strings'
+    return cmp(a.lower(), b.lower())
+
+def icu_case_sensitive_strcmp(collator, a, b):
+    'Compare a and b with collator, leaving their case untouched'
+    return collator.strcmp(a, b)
+
+def icu_capitalize(s):
+    'Lower case s, then upper case its first character; empty input is returned as is'
+    s = lower(s)
+    return s.replace(s[0], upper(s[0]), 1) if s else s
+
+_cmap = {}
+def icu_contractions(collator):
+    'Contractions of collator with empty entries dropped; the result is cached per collator in _cmap'
+    global _cmap
+    ans = _cmap.get(collator, None)
+    if ans is None:
+        ans = collator.contractions()
+        ans = frozenset(filter(None, ans)) if ans else {}
+        _cmap[collator] = ans
+    return ans
+
+def icu_collation_order(collator, a):
+    'collator.collation_order(a), retried with a converted to unicode on TypeError'
+    try:
+        return collator.collation_order(a)
+    except TypeError:
+        return collator.collation_order(unicode(a))
+
+load_icu()
+load_collator()
+_icu_not_ok = _icu is None or _collator is None
+icu_unicode_version = getattr(_icu, 'unicode_version', None)
+_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
+
+try:
+    senc = sys.getdefaultencoding()
+    if not senc or senc.lower() == 'ascii':
+        _icu.set_default_encoding('utf-8')
+    del senc
+except Exception:
+    pass
+
+try:
+    fenc = sys.getfilesystemencoding()
+    if not fenc or fenc.lower() == 'ascii':
+        _icu.set_filesystem_encoding('utf-8')
+    del fenc
+except Exception:
+    pass
+
+
+# }}}
+
+################# The string functions ########################################
+
+sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
+
+strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
+
+case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
+        partial(icu_case_sensitive_sort_key, _collator)
+
+case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
+
+upper = (lambda s: s.upper()) if _icu_not_ok else \
+    partial(icu_change_case, True, get_locale())
+
+lower = (lambda s: s.lower()) if _icu_not_ok else \
+    partial(icu_change_case, False, get_locale())
+
+title_case = (lambda s: s.title()) if _icu_not_ok else \
+    partial(_icu.title, get_locale())
+
+capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
+    (lambda s: icu_capitalize(s))
+
+find = (py_find if _icu_not_ok else partial(icu_find, _collator))
+
+contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
+    _collator)))
+
+def primary_strcmp(a, b):
+    'strcmp that ignores case and accents on letters'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return py_strcmp(ascii_text(a), ascii_text(b))
+    try:
+        return _primary_collator.strcmp(a, b)
+    except AttributeError:
+        return primary_collator().strcmp(a, b)
+
+def primary_find(pat, src):
+    'find that ignores case and accents on letters'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return py_find(ascii_text(pat), ascii_text(src))
+    return primary_icu_find(pat, src)
+
+def primary_icu_find(pat, src):
+    'icu_find() with the primary collator, which is created on first use'
+    try:
+        return icu_find(_primary_collator, pat, src)
+    except AttributeError:
+        return icu_find(primary_collator(), pat, src)
+
+def primary_sort_key(val):
+    'A sort key that ignores case and diacritics'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return ascii_text(val).lower()
+    try:
+        return _primary_collator.sort_key(val)
+    except AttributeError:
+        return primary_collator().sort_key(val)
+
+def primary_startswith(a, b):
+    'startswith that uses the primary collator (lower-cased ascii_text() comparison without ICU)'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return ascii_text(a).lower().startswith(ascii_text(b).lower())
+    try:
+        return icu_startswith(_primary_collator, a, b)
+    except AttributeError:
+        return icu_startswith(primary_collator(), a, b)
+
+def collation_order(a):
+    'Collation order of a from the sort collator; without ICU (ord(a[0]), 1), or (0, 0) for empty a'
+    if _icu_not_ok:
+        return (ord(a[0]), 1) if a else (0, 0)
+    try:
+        return icu_collation_order(_sort_collator, a)
+    except AttributeError:
+        return icu_collation_order(sort_collator(), a)
+
+################################################################################
+
+def test():  # {{{
+    'Ad hoc self test of this module, run when the module is executed as a script'
+    from calibre import prints
+    # Data {{{
+    german = '''
+    Sonntag
+Montag
+Dienstag
+Januar
+Februar
+März
+Fuße
+Fluße
+Flusse
+flusse
+fluße
+flüße
+flüsse
+'''
+    german_good = '''
+    Dienstag
+Februar
+flusse
+Flusse
+fluße
+Fluße
+flüsse
+flüße
+Fuße
+Januar
+März
+Montag
+Sonntag'''
+    french = '''
+dimanche
+lundi
+mardi
+janvier
+février
+mars
+déjà
+Meme
+deja
+même
+dejà
+bpef
+bœg
+Boef
+Mémé
+bœf
+boef
+bnef
+pêche
+pèché
+pêché
+pêche
+pêché'''
+    french_good = '''
+            bnef
+            boef
+            Boef
+            bœf
+            bœg
+            bpef
+            deja
+            dejà
+            déjà
+            dimanche
+            février
+            janvier
+            lundi
+            mardi
+            mars
+            Meme
+            Mémé
+            même
+            pèché
+            pêche
+            pêche
+            pêché
+            pêché'''
+    # }}}
+
+    def create(l):
+        l = l.decode('utf-8').splitlines()
+        return [x.strip() for x in l if x.strip()]
+
+    def test_strcmp(entries):
+        for x in entries:
+            for y in entries:
+                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
+                    print 'strcmp failed for %r, %r'%(x, y)
+
+    german = create(german)
+    c = _icu.Collator('de')
+    c.numeric = True
+    gs = list(sorted(german, key=c.sort_key))
+    if gs != create(german_good):
+        print 'German sorting failed'
+        return
+    print
+    french = create(french)
+    c = _icu.Collator('fr')
+    c.numeric = True
+    fs = list(sorted(french, key=c.sort_key))
+    if fs != create(french_good):
+        print 'French sorting failed (note that French fails with icu < 4.6)'
+        return
+    test_strcmp(german + french)
+
+    print '\nTesting case transforms in current locale'
+    from calibre.utils.titlecase import titlecase
+    for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
+        print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
+        print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
+        print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
+        print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
+        print
+
+    print '\nTesting primary collation'
+    for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
+            u'Štepánek':u'ŠtepaneK'}.iteritems():
+        if primary_strcmp(k, v) != 0:
+            prints('primary_strcmp() failed with %s != %s'%(k, v))
+            return
+        if primary_find(v, u' '+k)[0] != 1:
+            prints('primary_find() failed with %s not in %s'%(v, k))
+            return
+
+    n = character_name(safe_chr(0x1f431))
+    if n != u'CAT FACE':
+        raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % (n, u'CAT FACE'))
+
+    global _primary_collator
+    orig = _primary_collator
+    _primary_collator = _icu.Collator('es')
+    try:
+        if primary_strcmp(u'peña', u'pena') == 0:
+            print 'Primary collation in Spanish locale failed'
+            return
+    finally:
+        # Always put the module-wide collator back, including on the failure return
+        _primary_collator = orig
+
+    print '\nTesting contractions'
+    c = _icu.Collator('cs')
+    if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
+        u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
+        u'S\u030c', u'R\u030c']):
+        print 'Contractions for the Czech language failed'
+        return
+
+    print '\nTesting startswith'
+    p = primary_startswith
+    if (not p('asd', 'asd') or not p('asd', 'A') or
+            not p('x', '')):
+        print 'startswith() failed'
+        return
+
+    print '\nTesting collation_order()'
+    for group in [
+        ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
+        ('calibre', 'Charon', 'Collins'),
+        ('01', '1'),
+        ('1', '11', '13'),
+    ]:
+        last = None
+        for x in group:
+            val = icu_collation_order(sort_collator(), x)
+            if val[1] != 1:
+                prints('collation_order() returned incorrect length for', x)
+            if last is None:
+                last = val
+            else:
+                if val != last:
+                    prints('collation_order() returned incorrect value for', x)
+            last = val
+
+# }}}
+
+def test_roundtrip():
+    'Check that _icu.roundtrip() returns every sample string unchanged, raising ValueError otherwise'
+    for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
+        rp = _icu.roundtrip(r)
+        if rp != r:
+            raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp))
+
+def test_normalize_performance():
+    'Time normalize() against unicodedata.normalize() on the contents of t.txt, if that file exists'
+    import os
+    if not os.path.exists('t.txt'):
+        return
+    with open('t.txt', 'rb') as f:
+        raw = f.read().decode('utf-8')
+    print (len(raw))
+    import time, unicodedata
+    st = time.time()
+    count = 100
+    for i in xrange(count):
+        normalize(raw)
+    print ('ICU time:', time.time() - st)
+    st = time.time()
+    for i in xrange(count):
+        unicodedata.normalize('NFC', unicode(raw))
+    print ('py time:', time.time() - st)
+
+if __name__ == '__main__':
+    test_roundtrip()
+    test_normalize_performance()
+    test()
+