Allow merging of icu branch into trunk by falling back to the old icu module if the old binary plugin is detected.

2025-07-09 03:04:10 -04:00 · 2014-03-08 22:18:29 +05:30 · 2014-03-08 22:18:29 +05:30 · 1f2aa8a55b
commit 1f2aa8a55b
parent b76cc3e9ab
2 changed files with 551 additions and 0 deletions
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -247,6 +247,16 @@ def contractions(col=None):
 ################################################################################
 # Compatibility shim: when the loaded icu plugin has no change_case
 # attribute, replace this module's names with those of calibre.utils.icu_old.
 if not hasattr(_icu, 'change_case'):
    print ('You are running from source with an outdated calibre binary install. You'
           ' should update the main calibre binary to at least version 1.28.')
    # Don't break calibre for people running from source until the
    # next binary is available with the updated icu module
    from calibre.utils.icu_old import *  # noqa
    def primary_contains(pat, src):
        # A first element of -1 from primary_find is treated as "not found"
        return primary_find(pat, src)[0] != -1
 if __name__ == '__main__':
    # When executed as a script, run the tests from calibre.utils.icu_test
    from calibre.utils.icu_test import run
    run(verbosity=4)
--- a/src/calibre/utils/icu_old.py
+++ b/src/calibre/utils/icu_old.py
@ -0,0 +1,541 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 # Setup code {{{
 import sys
 from functools import partial
 from calibre.constants import plugins
 from calibre.utils.config_base import tweaks
 # Lazily initialised module state: the plugin module, the collators derived
 # from it and the sorting locale all start out unset.
 _icu = None
 _collator = None
 _primary_collator = None
 _sort_collator = None
 _numeric_collator = None
 _locale = None
 # Values returned for falsy input by the python (unicode) and the ICU
 # (bytes) key functions respectively.
 _none = u''
 _none2 = b''
 def get_locale():
    '''
    Return the locale used for collation, computing and caching it on first use.

    The 'locale_for_sorting' tweak wins when it is set, otherwise the value
    returned by get_lang() is used.
    '''
    global _locale
    if _locale is not None:
        return _locale
    from calibre.utils.localization import get_lang
    _locale = tweaks['locale_for_sorting'] or get_lang()
    return _locale
 def load_icu():
    global _icu
    if _icu is None:
        _icu = plugins['icu'][0]
        if _icu is None:
            print 'Loading ICU failed with: ', plugins['icu'][1]
        else:
            if not getattr(_icu, 'ok', False):
                print 'icu not ok'
                _icu = None
    return _icu
 def load_collator():
    '''
    Return the default collator for the current locale, creating it on first
    use. For most locales it takes both case and accented letters into account.

    Returns None when the icu plugin is not available.
    '''
    global _collator
    if _collator is not None:
        return _collator
    plugin = load_icu()
    if plugin is not None:
        _collator = plugin.Collator(get_locale())
    return _collator
 def primary_collator():
    '''
    Return the shared primary strength collator, which ignores case
    differences and accented characters. It is cloned from the default
    collator on first use.
    '''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = _collator.clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
 def sort_collator():
    '''
    Return the shared secondary strength collator used for sorting, which
    ignores case differences and, when the 'numeric_collation' tweak is set,
    recognizes numbers in strings. It is cloned from the default collator on
    first use.
    '''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = _collator.clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    if tweaks['numeric_collation']:
        try:
            _sort_collator.numeric = True
        except AttributeError:
            # A collator that rejects the numeric attribute is used as is
            pass
    return _sort_collator
 def py_sort_key(obj):
    'Pure python sort key: the lower cased value, or _none for falsy input'
    return obj.lower() if obj else _none
 def icu_sort_key(collator, obj):
    '''
    Return the sort key of obj as computed by the shared sort collator.

    The collator argument is not used; the key always comes from the
    collator returned by sort_collator(). Falsy input yields _none2.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _sort_collator.sort_key(obj)
        except AttributeError:
            # The sort collator has not been created yet
            return sort_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters removed
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return _sort_collator.sort_key(obj.replace(nul, empty))
 def numeric_collator():
    '''
    Return the shared secondary strength collator with numeric collation
    enabled, cloning it from the default collator on first use.

    The collator is cached in the same way as the ones returned by
    primary_collator() and sort_collator(), so calling this function
    repeatedly does not create a new clone every time.
    '''
    global _numeric_collator
    if _numeric_collator is None:
        _numeric_collator = _collator.clone()
        _numeric_collator.strength = _icu.UCOL_SECONDARY
        _numeric_collator.numeric = True
    return _numeric_collator
 def numeric_sort_key(obj):
    '''
    Return a sort key that uses natural sorting for numbers inside strings,
    so something2 will sort before something10. Falsy input yields _none2.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _numeric_collator.sort_key(obj)
        except AttributeError:
            # The numeric collator has not been created yet
            return numeric_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters removed
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return _numeric_collator.sort_key(obj.replace(nul, empty))
 def icu_change_case(upper, locale, obj):
    '''
    Return obj converted to upper case (when upper is true) or lower case
    using the icu plugin and the given locale.

    If the plugin raises TypeError the conversion is retried once with NUL
    characters removed from obj.
    '''
    convert = _icu.upper if upper else _icu.lower
    try:
        return convert(locale, obj)
    except TypeError:
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return convert(locale, obj.replace(nul, empty))
 def py_find(pattern, source):
    '''
    Pure python substring search. Returns (position, length of pattern) for
    the first occurrence of pattern in source, or (-1, -1) when it is absent.
    '''
    idx = source.find(pattern)
    return (idx, len(pattern)) if idx > -1 else (-1, -1)
 def character_name(string):
    '''
    Return the unicode name of the character in string, or None when no name
    can be determined.

    The icu plugin is used when it provides character_name(); otherwise the
    first character is looked up with unicodedata.
    '''
    try:
        try:
            return _icu.character_name(unicode(string)) or None
        except AttributeError:
            import unicodedata
            return unicodedata.name(unicode(string)[0], None)
    except (TypeError, ValueError, KeyError, IndexError):
        # IndexError: the unicodedata fallback indexes the first character,
        # which does not exist for an empty string
        return None
 def character_name_from_code(code):
    '''
    Return the unicode name of the code point code, or the empty string when
    no name can be determined.

    The icu plugin is used when it provides character_name_from_code();
    otherwise unicodedata is used.
    '''
    try:
        try:
            name = _icu.character_name_from_code(code)
        except AttributeError:
            import unicodedata
            return unicodedata.name(py_safe_chr(code), '')
        return name or ''
    except (TypeError, ValueError, KeyError):
        return ''
 # py_safe_chr(i) converts an integer code point into a unicode string. Which
 # implementation is used depends on the largest code point this python build
 # can store in a single character.
 if sys.maxunicode >= 0x10ffff:
    try:
        py_safe_chr = unichr
    except NameError:
        # No unichr builtin in this interpreter, chr is used instead
        py_safe_chr = chr
 else:
    def py_safe_chr(i):
        # Narrow builds of python cannot represent code point > 0xffff as a
        # single character, so we need our own implementation of unichr
        # that returns them as a surrogate pair
        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
 def safe_chr(code):
    '''
    Return the unicode string for the code point code, using the icu plugin
    and falling back to py_safe_chr() when that raises AttributeError.
    '''
    try:
        return _icu.chr(code)
    except AttributeError:
        return py_safe_chr(code)
 def normalize(text, mode='NFC'):
    '''
    Return text converted to unicode and normalized with the given mode.

    The icu plugin is used when it is available and mode is one of the keys
    of _nmodes, otherwise unicodedata.normalize() is used.
    '''
    # This is very slightly slower than using unicodedata.normalize, so stick with
    # that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    try:
        return _icu.normalize(_nmodes[mode], unicode(text))
    except (AttributeError, KeyError):
        import unicodedata
        return unicodedata.normalize(mode, unicode(text))
 def icu_find(collator, pattern, source):
    '''
    Return the result of collator.find(pattern, source). If that raises
    TypeError the call is retried with both arguments converted to unicode.
    '''
    search = collator.find
    try:
        return search(pattern, source)
    except TypeError:
        return search(unicode(pattern), unicode(source))
 def icu_startswith(collator, a, b):
    '''
    Return the result of collator.startswith(a, b). If that raises TypeError
    the call is retried with both arguments converted to unicode.
    '''
    check = collator.startswith
    try:
        return check(a, b)
    except TypeError:
        return check(unicode(a), unicode(b))
 def py_case_sensitive_sort_key(obj):
    'Pure python case sensitive sort key: obj itself, or _none for falsy input'
    return obj or _none
 def icu_case_sensitive_sort_key(collator, obj):
    'Return collator.sort_key(obj), or _none2 when obj is falsy'
    return collator.sort_key(obj) if obj else _none2
 def icu_strcmp(collator, a, b):
    'Compare a and b with collator.strcmp() after lower casing both of them'
    left, right = lower(a), lower(b)
    return collator.strcmp(left, right)
 def py_strcmp(a, b):
    'Pure python comparison of a and b that ignores case, with cmp() semantics'
    left, right = a.lower(), b.lower()
    return cmp(left, right)
 def icu_case_sensitive_strcmp(collator, a, b):
    'Compare a and b with collator.strcmp(), without changing their case first'
    return collator.strcmp(a, b)
 def icu_capitalize(s):
    '''
    Return s lower cased, with the first occurrence of its first character
    replaced by the upper cased form of that character. An empty result is
    returned unchanged.
    '''
    lowered = lower(s)
    if not lowered:
        return lowered
    first = lowered[0]
    return lowered.replace(first, upper(first), 1)
 # Cache of the contractions computed so far, keyed by collator
 _cmap = {}
 def icu_contractions(collator):
    '''
    Return the contractions reported by collator, with falsy entries removed.

    The result is a frozenset, or an empty dict when the collator reports no
    contractions, and is cached per collator.
    '''
    global _cmap
    cached = _cmap.get(collator, None)
    if cached is not None:
        return cached
    raw = collator.contractions()
    cached = frozenset(filter(None, raw)) if raw else {}
    _cmap[collator] = cached
    return cached
 def icu_collation_order(collator, a):
    '''
    Return the result of collator.collation_order(a). If that raises
    TypeError the call is retried with a converted to unicode.
    '''
    order_of = collator.collation_order
    try:
        return order_of(a)
    except TypeError:
        return order_of(unicode(a))
 # Initialise the module state at import time
 load_icu()
 load_collator()
 # True when either the plugin or the default collator is unavailable, in
 # which case the pure python fallbacks are used
 _icu_not_ok = _icu is None or _collator is None
 icu_unicode_version = getattr(_icu, 'unicode_version', None)
 # Maps normalization mode names to the plugin's UNORM_* attributes (None for
 # every mode the plugin does not define)
 _nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}
 # Best effort: tell the plugin to use utf-8 when python reports no default
 # encoding or ascii. Failures (for instance a missing plugin) are ignored,
 # but KeyboardInterrupt and SystemExit are no longer swallowed.
 try:
    senc = sys.getdefaultencoding()
    if not senc or senc.lower() == 'ascii':
        _icu.set_default_encoding('utf-8')
    del senc
 except Exception:
    pass
 # Same as above, for the filesystem encoding
 try:
    fenc = sys.getfilesystemencoding()
    if not fenc or fenc.lower() == 'ascii':
        _icu.set_filesystem_encoding('utf-8')
    del fenc
 except Exception:
    pass
 # }}}
 ################# The string functions ########################################
 # Every public name below is bound to the ICU implementation when the plugin
 # and the default collator loaded correctly, and to a pure python fallback
 # otherwise.
 sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)
 strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)
 case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
        partial(icu_case_sensitive_sort_key, _collator)
 # The collator has to be bound here, like it is for strcmp above, so that
 # this accepts (a, b) just as the cmp fallback does. Without the partial
 # every two argument call raised TypeError when ICU was available.
 case_sensitive_strcmp = cmp if _icu_not_ok else \
    partial(icu_case_sensitive_strcmp, _collator)
 upper = (lambda s: s.upper()) if _icu_not_ok else \
    partial(icu_change_case, True, get_locale())
 lower = (lambda s: s.lower()) if _icu_not_ok else \
    partial(icu_change_case, False, get_locale())
 title_case = (lambda s: s.title()) if _icu_not_ok else \
    partial(_icu.title, get_locale())
 capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
    (lambda s: icu_capitalize(s))
 find = (py_find if _icu_not_ok else partial(icu_find, _collator))
 contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
    _collator)))
 def primary_strcmp(a, b):
    '''
    strcmp that ignores case and accents on letters. Without ICU both
    arguments are converted with ascii_text() and compared ignoring case.
    '''
    if not _icu_not_ok:
        try:
            return _primary_collator.strcmp(a, b)
        except AttributeError:
            # The primary collator has not been created yet
            return primary_collator().strcmp(a, b)
    from calibre.utils.filenames import ascii_text
    return py_strcmp(ascii_text(a), ascii_text(b))
 def primary_find(pat, src):
    '''
    find that ignores case and accents on letters. Without ICU both arguments
    are converted with ascii_text() and searched with py_find().
    '''
    if not _icu_not_ok:
        return primary_icu_find(pat, src)
    from calibre.utils.filenames import ascii_text
    return py_find(ascii_text(pat), ascii_text(src))
 def primary_icu_find(pat, src):
    '''
    Search for pat in src with icu_find() and the primary collator, creating
    that collator when the first attempt raises AttributeError.
    '''
    try:
        return icu_find(_primary_collator, pat, src)
    except AttributeError:
        # _primary_collator starts out as None and is set by primary_collator()
        return icu_find(primary_collator(), pat, src)
 def primary_sort_key(val):
    '''
    A sort key that ignores case and diacritics. Without ICU it is the lower
    cased result of ascii_text(val).
    '''
    if not _icu_not_ok:
        try:
            return _primary_collator.sort_key(val)
        except AttributeError:
            # The primary collator has not been created yet
            return primary_collator().sort_key(val)
    from calibre.utils.filenames import ascii_text
    return ascii_text(val).lower()
 def primary_startswith(a, b):
    '''
    Return whether a starts with b according to the primary collator. Without
    ICU the lower cased ascii_text() forms of a and b are compared instead.
    '''
    if not _icu_not_ok:
        try:
            return icu_startswith(_primary_collator, a, b)
        except AttributeError:
            # The primary collator has not been created yet
            return icu_startswith(primary_collator(), a, b)
    from calibre.utils.filenames import ascii_text
    haystack, prefix = ascii_text(a).lower(), ascii_text(b).lower()
    return haystack.startswith(prefix)
 def collation_order(a):
    '''
    Return the collation order of a as reported by the sort collator.

    Without ICU the result is (ord(a[0]), 1), or (0, 0) when a is empty.
    '''
    if not _icu_not_ok:
        try:
            return icu_collation_order(_sort_collator, a)
        except AttributeError:
            # The sort collator has not been created yet
            return icu_collation_order(sort_collator(), a)
    if a:
        return (ord(a[0]), 1)
    return (0, 0)
 ################################################################################
 def test():  # {{{
    from calibre import prints
    # Data {{{
    german = '''
    Sonntag
 Montag
 Dienstag
 Januar
 Februar
 März
 Fuße
 Fluße
 Flusse
 flusse
 fluße
 flüße
 flüsse
 '''
    german_good = '''
    Dienstag
 Februar
 flusse
 Flusse
 fluße
 Fluße
 flüsse
 flüße
 Fuße
 Januar
 März
 Montag
 Sonntag'''
    french = '''
 dimanche
 lundi
 mardi
 janvier
 février
 mars
 déjà
 Meme
 deja
 même
 dejà
 bpef
 bœg
 Boef
 Mémé
 bœf
 boef
 bnef
 pêche
 pèché
 pêché
 pêche
 pêché'''
    french_good = '''
            bnef
        boef
        Boef
        bœf
        bœg
        bpef
        deja
        dejà
        déjà
        dimanche
        février
        janvier
        lundi
        mardi
        mars
        Meme
        Mémé
        même
        pèché
        pêche
        pêche
        pêché
        pêché'''
    # }}}
    def create(l):
        l = l.decode('utf-8').splitlines()
        return [x.strip() for x in l if x.strip()]
    def test_strcmp(entries):
        for x in entries:
            for y in entries:
                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
                    print 'strcmp failed for %r, %r'%(x, y)
    german = create(german)
    c = _icu.Collator('de')
    c.numeric = True
    gs = list(sorted(german, key=c.sort_key))
    if gs != create(german_good):
        print 'German sorting failed'
        return
    print
    french = create(french)
    c = _icu.Collator('fr')
    c.numeric = True
    fs = list(sorted(french, key=c.sort_key))
    if fs != create(french_good):
        print 'French sorting failed (note that French fails with icu < 4.6)'
        return
    test_strcmp(german + french)
    print '\nTesting case transforms in current locale'
    from calibre.utils.titlecase import titlecase
    for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
        print 'Upper:     ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
        print 'Lower:     ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
        print 'Title:     ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
        print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
        print
    print '\nTesting primary collation'
    for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
            u'Štepánek':u'ŠtepaneK'}.iteritems():
        if primary_strcmp(k, v) != 0:
            prints('primary_strcmp() failed with %s != %s'%(k, v))
            return
        if primary_find(v, u' '+k)[0] != 1:
            prints('primary_find() failed with %s not in %s'%(v, k))
            return
    n = character_name(safe_chr(0x1f431))
    if n != u'CAT FACE':
        raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
    global _primary_collator
    orig = _primary_collator
    _primary_collator = _icu.Collator('es')
    if primary_strcmp(u'peña', u'pena') == 0:
        print 'Primary collation in Spanish locale failed'
        return
    _primary_collator = orig
    print '\nTesting contractions'
    c = _icu.Collator('cs')
    if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
        u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
        u'S\u030c', u'R\u030c']):
        print 'Contractions for the Czech language failed'
        return
    print '\nTesting startswith'
    p = primary_startswith
    if (not p('asd', 'asd') or not p('asd', 'A') or
            not p('x', '')):
        print 'startswith() failed'
        return
    print '\nTesting collation_order()'
    for group in [
        ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
        ('calibre', 'Charon', 'Collins'),
        ('01', '1'),
        ('1', '11', '13'),
    ]:
        last = None
        for x in group:
            val = icu_collation_order(sort_collator(), x)
            if val[1] != 1:
                prints('collation_order() returned incorrect length for', x)
            if last is None:
                last = val
            else:
                if val != last:
                    prints('collation_order() returned incorrect value for', x)
            last = val
 # }}}
 def test_roundtrip():
    '''
    Check that _icu.roundtrip() returns a set of sample strings unchanged.
    Raises ValueError for the first sample that differs.
    '''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for sample in samples:
        result = _icu.roundtrip(sample)
        if result != sample:
            raise ValueError(u'Roundtripping failed: %r != %r' % (sample, result))
 def test_normalize_performance():
    '''
    Time normalize() against unicodedata.normalize() on the contents of the
    file t.txt in the current directory, printing both timings.

    Does nothing when t.txt does not exist.
    '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Close the sample file as soon as it has been read instead of leaving
    # the handle open
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time, unicodedata
    st = time.time()
    count = 100
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
 if __name__ == '__main__':
    # When executed as a script, run the three manual tests defined above
    test_roundtrip()
    test_normalize_performance()
    test()