Allow merging of icu branch into trunk by falling back to the old icu module if the old binary plugin is detected.

2025-07-09 03:04:10 -04:00 · 2014-03-08 22:18:29 +05:30 · 2014-03-08 22:18:29 +05:30 · 1f2aa8a55b
commit 1f2aa8a55b
parent b76cc3e9ab
2 changed files with 551 additions and 0 deletions
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -247,6 +247,16 @@ def contractions(col=None):

 ################################################################################

+if not hasattr(_icu, 'change_case'):
+    print ('You are running from source with an outdated calibre binary install. You'
+           ' should update the main calibre binary to at least version 1.28.')
+    # Dont creak calibre for people running from source until the
+    # next binary is available witht he update icu module
+    from calibre.utils.icu_old import *  # noqa
+
+    def primary_contains(pat, src):
+        return primary_find(pat, src)[0] != -1
+
 if __name__ == '__main__':
    from calibre.utils.icu_test import run
    run(verbosity=4)
--- a/src/calibre/utils/icu_old.py
+++ b/src/calibre/utils/icu_old.py
@ -0,0 +1,541 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+# Setup code {{{
+import sys
+from functools import partial
+
+from calibre.constants import plugins
+from calibre.utils.config_base import tweaks
+
+_icu = _collator = _primary_collator = _sort_collator = _numeric_collator = None
+_locale = None
+
+_none = u''
+_none2 = b''
+
+def get_locale():
+    global _locale
+    if _locale is None:
+        from calibre.utils.localization import get_lang
+        if tweaks['locale_for_sorting']:
+            _locale = tweaks['locale_for_sorting']
+        else:
+            _locale = get_lang()
+    return _locale
+
+def load_icu():
+    global _icu
+    if _icu is None:
+        _icu = plugins['icu'][0]
+        if _icu is None:
+            print 'Loading ICU failed with: ', plugins['icu'][1]
+        else:
+            if not getattr(_icu, 'ok', False):
+                print 'icu not ok'
+                _icu = None
+    return _icu
+
+def load_collator():
+    'The default collator for most locales takes both case and accented letters into account'
+    global _collator
+    if _collator is None:
+        icu = load_icu()
+        if icu is not None:
+            _collator = icu.Collator(get_locale())
+    return _collator
+
+def primary_collator():
+    'Ignores case differences and accented characters'
+    global _primary_collator
+    if _primary_collator is None:
+        _primary_collator = _collator.clone()
+        _primary_collator.strength = _icu.UCOL_PRIMARY
+    return _primary_collator
+
+def sort_collator():
+    'Ignores case differences and recognizes numbers in strings'
+    global _sort_collator
+    if _sort_collator is None:
+        _sort_collator = _collator.clone()
+        _sort_collator.strength = _icu.UCOL_SECONDARY
+        if tweaks['numeric_collation']:
+            try:
+                _sort_collator.numeric = True
+            except AttributeError:
+                pass
+    return _sort_collator
+
+def py_sort_key(obj):
+    if not obj:
+        return _none
+    return obj.lower()
+
+def icu_sort_key(collator, obj):
+    if not obj:
+        return _none2
+    try:
+        try:
+            return _sort_collator.sort_key(obj)
+        except AttributeError:
+            return sort_collator().sort_key(obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return _sort_collator.sort_key(obj)
+
+def numeric_collator():
+    global _numeric_collator
+    _numeric_collator = _collator.clone()
+    _numeric_collator.strength = _icu.UCOL_SECONDARY
+    _numeric_collator.numeric = True
+    return _numeric_collator
+
+def numeric_sort_key(obj):
+    'Uses natural sorting for numbers inside strings so something2 will sort before something10'
+    if not obj:
+        return _none2
+    try:
+        try:
+            return _numeric_collator.sort_key(obj)
+        except AttributeError:
+            return numeric_collator().sort_key(obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return _numeric_collator.sort_key(obj)
+
+def icu_change_case(upper, locale, obj):
+    func = _icu.upper if upper else _icu.lower
+    try:
+        return func(locale, obj)
+    except TypeError:
+        if isinstance(obj, unicode):
+            obj = obj.replace(u'\0', u'')
+        else:
+            obj = obj.replace(b'\0', b'')
+        return func(locale, obj)
+
+def py_find(pattern, source):
+    pos = source.find(pattern)
+    if pos > -1:
+        return pos, len(pattern)
+    return -1, -1
+
+def character_name(string):
+    try:
+        try:
+            return _icu.character_name(unicode(string)) or None
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(unicode(string)[0], None)
+    except (TypeError, ValueError, KeyError):
+        pass
+
+def character_name_from_code(code):
+    try:
+        try:
+            return _icu.character_name_from_code(code) or ''
+        except AttributeError:
+            import unicodedata
+            return unicodedata.name(py_safe_chr(code), '')
+    except (TypeError, ValueError, KeyError):
+        return ''
+
+if sys.maxunicode >= 0x10ffff:
+    try:
+        py_safe_chr = unichr
+    except NameError:
+        py_safe_chr = chr
+else:
+    def py_safe_chr(i):
+        # Narrow builds of python cannot represent code point > 0xffff as a
+        # single character, so we need our own implementation of unichr
+        # that returns them as a surrogate pair
+        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
+
+def safe_chr(code):
+    try:
+        return _icu.chr(code)
+    except AttributeError:
+        return py_safe_chr(code)
+
+def normalize(text, mode='NFC'):
+    # This is very slightly slower than using unicodedata.normalize, so stick with
+    # that unless you have very good reasons not too. Also, it's speed
+    # decreases on wide python builds, where conversion to/from ICU's string
+    # representation is slower.
+    try:
+        return _icu.normalize(_nmodes[mode], unicode(text))
+    except (AttributeError, KeyError):
+        import unicodedata
+        return unicodedata.normalize(mode, unicode(text))
+
+def icu_find(collator, pattern, source):
+    try:
+        return collator.find(pattern, source)
+    except TypeError:
+        return collator.find(unicode(pattern), unicode(source))
+
+def icu_startswith(collator, a, b):
+    try:
+        return collator.startswith(a, b)
+    except TypeError:
+        return collator.startswith(unicode(a), unicode(b))
+
+def py_case_sensitive_sort_key(obj):
+    if not obj:
+        return _none
+    return obj
+
+def icu_case_sensitive_sort_key(collator, obj):
+    if not obj:
+        return _none2
+    return collator.sort_key(obj)
+
+def icu_strcmp(collator, a, b):
+    return collator.strcmp(lower(a), lower(b))
+
+def py_strcmp(a, b):
+    return cmp(a.lower(), b.lower())
+
+def icu_case_sensitive_strcmp(collator, a, b):
+    return collator.strcmp(a, b)
+
+def icu_capitalize(s):
+    s = lower(s)
+    return s.replace(s[0], upper(s[0]), 1) if s else s
+
+_cmap = {}
+def icu_contractions(collator):
+    global _cmap
+    ans = _cmap.get(collator, None)
+    if ans is None:
+        ans = collator.contractions()
+        ans = frozenset(filter(None, ans)) if ans else {}
+        _cmap[collator] = ans
+    return ans
+
+def icu_collation_order(collator, a):
+    try:
+        return collator.collation_order(a)
+    except TypeError:
+        return collator.collation_order(unicode(a))
+
+load_icu()
+load_collator()
+_icu_not_ok = _icu is None or _collator is None
+icu_unicode_version = getattr(_icu, 'unicode_version', None)
+_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
+
+try:
+    senc = sys.getdefaultencoding()
+    if not senc or senc.lower() == 'ascii':
+        _icu.set_default_encoding('utf-8')
+    del senc
+except:
+    pass
+
+try:
+    fenc = sys.getfilesystemencoding()
+    if not fenc or fenc.lower() == 'ascii':
+        _icu.set_filesystem_encoding('utf-8')
+    del fenc
+except:
+    pass
+
+
+# }}}
+
+################# The string functions ########################################
+
+sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
+
+strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
+
+case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
+        partial(icu_case_sensitive_sort_key, _collator)
+
+case_sensitive_strcmp = cmp if _icu_not_ok else icu_case_sensitive_strcmp
+
+upper = (lambda s: s.upper()) if _icu_not_ok else \
+    partial(icu_change_case, True, get_locale())
+
+lower = (lambda s: s.lower()) if _icu_not_ok else \
+    partial(icu_change_case, False, get_locale())
+
+title_case = (lambda s: s.title()) if _icu_not_ok else \
+    partial(_icu.title, get_locale())
+
+capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
+    (lambda s: icu_capitalize(s))
+
+find = (py_find if _icu_not_ok else partial(icu_find, _collator))
+
+contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
+    _collator)))
+
+def primary_strcmp(a, b):
+    'strcmp that ignores case and accents on letters'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return py_strcmp(ascii_text(a), ascii_text(b))
+    try:
+        return _primary_collator.strcmp(a, b)
+    except AttributeError:
+        return primary_collator().strcmp(a, b)
+
+def primary_find(pat, src):
+    'find that ignores case and accents on letters'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return py_find(ascii_text(pat), ascii_text(src))
+    return primary_icu_find(pat, src)
+
+def primary_icu_find(pat, src):
+    try:
+        return icu_find(_primary_collator, pat, src)
+    except AttributeError:
+        return icu_find(primary_collator(), pat, src)
+
+def primary_sort_key(val):
+    'A sort key that ignores case and diacritics'
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return ascii_text(val).lower()
+    try:
+        return _primary_collator.sort_key(val)
+    except AttributeError:
+        return primary_collator().sort_key(val)
+
+def primary_startswith(a, b):
+    if _icu_not_ok:
+        from calibre.utils.filenames import ascii_text
+        return ascii_text(a).lower().startswith(ascii_text(b).lower())
+    try:
+        return icu_startswith(_primary_collator, a, b)
+    except AttributeError:
+        return icu_startswith(primary_collator(), a, b)
+
+def collation_order(a):
+    if _icu_not_ok:
+        return (ord(a[0]), 1) if a else (0, 0)
+    try:
+        return icu_collation_order(_sort_collator, a)
+    except AttributeError:
+        return icu_collation_order(sort_collator(), a)
+
+################################################################################
+
+def test():  # {{{
+    from calibre import prints
+    # Data {{{
+    german = '''
+    Sonntag
+Montag
+Dienstag
+Januar
+Februar
+März
+Fuße
+Fluße
+Flusse
+flusse
+fluße
+flüße
+flüsse
+'''
+    german_good = '''
+    Dienstag
+Februar
+flusse
+Flusse
+fluße
+Fluße
+flüsse
+flüße
+Fuße
+Januar
+März
+Montag
+Sonntag'''
+    french = '''
+dimanche
+lundi
+mardi
+janvier
+février
+mars
+déjà
+Meme
+deja
+même
+dejà
+bpef
+bœg
+Boef
+Mémé
+bœf
+boef
+bnef
+pêche
+pèché
+pêché
+pêche
+pêché'''
+    french_good = '''
+            bnef
+        boef
+        Boef
+        bœf
+        bœg
+        bpef
+        deja
+        dejà
+        déjà
+        dimanche
+        février
+        janvier
+        lundi
+        mardi
+        mars
+        Meme
+        Mémé
+        même
+        pèché
+        pêche
+        pêche
+        pêché
+        pêché'''
+    # }}}
+
+    def create(l):
+        l = l.decode('utf-8').splitlines()
+        return [x.strip() for x in l if x.strip()]
+
+    def test_strcmp(entries):
+        for x in entries:
+            for y in entries:
+                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
+                    print 'strcmp failed for %r, %r'%(x, y)
+
+    german = create(german)
+    c = _icu.Collator('de')
+    c.numeric = True
+    gs = list(sorted(german, key=c.sort_key))
+    if gs != create(german_good):
+        print 'German sorting failed'
+        return
+    print
+    french = create(french)
+    c = _icu.Collator('fr')
+    c.numeric = True
+    fs = list(sorted(french, key=c.sort_key))
+    if fs != create(french_good):
+        print 'French sorting failed (note that French fails with icu < 4.6)'
+        return
+    test_strcmp(german + french)
+
+    print '\nTesting case transforms in current locale'
+    from calibre.utils.titlecase import titlecase
+    for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
+        print 'Upper:     ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
+        print 'Lower:     ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
+        print 'Title:     ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
+        print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
+        print
+
+    print '\nTesting primary collation'
+    for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
+            u'Štepánek':u'ŠtepaneK'}.iteritems():
+        if primary_strcmp(k, v) != 0:
+            prints('primary_strcmp() failed with %s != %s'%(k, v))
+            return
+        if primary_find(v, u' '+k)[0] != 1:
+            prints('primary_find() failed with %s not in %s'%(v, k))
+            return
+
+    n = character_name(safe_chr(0x1f431))
+    if n != u'CAT FACE':
+        raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
+
+    global _primary_collator
+    orig = _primary_collator
+    _primary_collator = _icu.Collator('es')
+    if primary_strcmp(u'peña', u'pena') == 0:
+        print 'Primary collation in Spanish locale failed'
+        return
+    _primary_collator = orig
+
+    print '\nTesting contractions'
+    c = _icu.Collator('cs')
+    if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
+        u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
+        u'S\u030c', u'R\u030c']):
+        print 'Contractions for the Czech language failed'
+        return
+
+    print '\nTesting startswith'
+    p = primary_startswith
+    if (not p('asd', 'asd') or not p('asd', 'A') or
+            not p('x', '')):
+        print 'startswith() failed'
+        return
+
+    print '\nTesting collation_order()'
+    for group in [
+        ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
+        ('calibre', 'Charon', 'Collins'),
+        ('01', '1'),
+        ('1', '11', '13'),
+    ]:
+        last = None
+        for x in group:
+            val = icu_collation_order(sort_collator(), x)
+            if val[1] != 1:
+                prints('collation_order() returned incorrect length for', x)
+            if last is None:
+                last = val
+            else:
+                if val != last:
+                    prints('collation_order() returned incorrect value for', x)
+            last = val
+
+# }}}
+
+def test_roundtrip():
+    for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
+        rp = _icu.roundtrip(r)
+        if rp != r:
+            raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp))
+
+def test_normalize_performance():
+    import os
+    if not os.path.exists('t.txt'):
+        return
+    raw = open('t.txt', 'rb').read().decode('utf-8')
+    print (len(raw))
+    import time, unicodedata
+    st = time.time()
+    count = 100
+    for i in xrange(count):
+        normalize(raw)
+    print ('ICU time:', time.time() - st)
+    st = time.time()
+    for i in xrange(count):
+        unicodedata.normalize('NFC', unicode(raw))
+    print ('py time:', time.time() - st)
+
+if __name__ == '__main__':
+    test_roundtrip()
+    test_normalize_performance()
+    test()
+