mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow merging of icu branch into trunk by falling back to the old icu module if the old binary plugin is detected.
This commit is contained in:
parent
b76cc3e9ab
commit
1f2aa8a55b
@ -247,6 +247,16 @@ def contractions(col=None):
|
|||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
|
if not hasattr(_icu, 'change_case'):
|
||||||
|
print ('You are running from source with an outdated calibre binary install. You'
|
||||||
|
' should update the main calibre binary to at least version 1.28.')
|
||||||
|
# Don't break calibre for people running from source until the
|
||||||
|
# next binary is available with the updated icu module
|
||||||
|
from calibre.utils.icu_old import * # noqa
|
||||||
|
|
||||||
|
def primary_contains(pat, src):
|
||||||
|
return primary_find(pat, src)[0] != -1
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from calibre.utils.icu_test import run
|
from calibre.utils.icu_test import run
|
||||||
run(verbosity=4)
|
run(verbosity=4)
|
||||||
|
541
src/calibre/utils/icu_old.py
Normal file
541
src/calibre/utils/icu_old.py
Normal file
@ -0,0 +1,541 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
# Setup code {{{
|
||||||
|
import sys
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from calibre.constants import plugins
|
||||||
|
from calibre.utils.config_base import tweaks
|
||||||
|
|
||||||
|
# Module state. Everything starts out as None and is filled in by load_icu(),
# load_collator(), get_locale() and the *_collator() helpers below.
_icu = None
_collator = None
_primary_collator = None
_sort_collator = None
_numeric_collator = None
_locale = None

# Values returned as the sort key of empty/falsy input: _none by the pure
# python key functions, _none2 by the ICU based ones.
_none = u''
_none2 = b''
|
||||||
|
|
||||||
|
def get_locale():
    '''Return the locale used for collation, computing it on first use.

    A truthy ``locale_for_sorting`` tweak takes precedence; otherwise the
    value returned by ``get_lang()`` is used. The result is cached in the
    module-level ``_locale``.
    '''
    global _locale
    if _locale is not None:
        return _locale
    # Imported at call time rather than at module import time
    from calibre.utils.localization import get_lang
    _locale = tweaks['locale_for_sorting'] or get_lang()
    return _locale
|
||||||
|
|
||||||
|
def load_icu():
    '''Return the ICU binary plugin, or None when it cannot be used.

    The plugin is looked up in ``plugins['icu']`` the first time this is
    called. A missing plugin, or one whose ``ok`` attribute is absent or
    falsy, is reported on stdout and treated as unavailable.
    '''
    global _icu
    if _icu is not None:
        return _icu
    _icu = plugins['icu'][0]
    if _icu is None:
        # Formatted by hand so that the output is the same whether print is
        # a statement or a function.
        print('%s %s' % ('Loading ICU failed with: ', plugins['icu'][1]))
    elif not getattr(_icu, 'ok', False):
        print('icu not ok')
        _icu = None
    return _icu
|
||||||
|
|
||||||
|
def load_collator():
    '''Return the default collator for the current locale, or None.

    The default collator for most locales takes both case and accented
    letters into account. It is created on first use and cached; None is
    returned when the ICU plugin is unavailable.
    '''
    global _collator
    if _collator is not None:
        return _collator
    icu_module = load_icu()
    if icu_module is not None:
        _collator = icu_module.Collator(get_locale())
    return _collator
|
||||||
|
|
||||||
|
def primary_collator():
    '''Return the shared primary-strength collator, creating it on first use.

    Ignores case differences and accented characters. The collator is a
    clone of the default collator, so load_collator() must have succeeded.
    '''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = _collator.clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
|
||||||
|
|
||||||
|
def sort_collator():
    '''Return the shared collator used for sorting, creating it on first use.

    Ignores case differences and recognizes numbers in strings. It is a
    clone of the default collator with secondary strength; numeric ordering
    is only requested when the ``numeric_collation`` tweak is truthy.
    '''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = _collator.clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    if tweaks['numeric_collation']:
        try:
            _sort_collator.numeric = True
        except AttributeError:
            # Tolerated: presumably a collator without numeric support
            pass
    return _sort_collator
|
||||||
|
|
||||||
|
def py_sort_key(obj):
    '''Pure python sort key: the lower-cased value, or ``_none`` if falsy.'''
    return obj.lower() if obj else _none
|
||||||
|
|
||||||
|
def icu_sort_key(collator, obj):
    '''Return the ICU sort key of *obj* using the shared sort collator.

    :param collator: Accepted but not used; the key is always computed with
        the module-level sort collator.
    :param obj: The string to compute a key for. Falsy values yield
        ``_none2``.

    If the collator raises TypeError, NUL characters are removed from *obj*
    and the call is retried once.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _sort_collator.sort_key(obj)
        except AttributeError:
            # The sort collator has not been created yet
            return sort_collator().sort_key(obj)
    except TypeError:
        nul, blank = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return _sort_collator.sort_key(obj.replace(nul, blank))
|
||||||
|
|
||||||
|
def numeric_collator():
    '''Create, store and return a secondary-strength numeric collator.

    Unlike the other collator accessors this builds a fresh clone of the
    default collator on every call and rebinds ``_numeric_collator`` to it.
    '''
    global _numeric_collator
    _numeric_collator = fresh = _collator.clone()
    fresh.strength = _icu.UCOL_SECONDARY
    fresh.numeric = True
    return fresh
|
||||||
|
|
||||||
|
def numeric_sort_key(obj):
    '''Return a sort key that orders numbers inside strings naturally.

    Uses natural sorting for numbers inside strings, so something2 will sort
    before something10. Falsy values yield ``_none2``. If the collator raises
    TypeError, NUL characters are removed from *obj* and the call is retried
    once.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _numeric_collator.sort_key(obj)
        except AttributeError:
            # The numeric collator has not been created yet
            return numeric_collator().sort_key(obj)
    except TypeError:
        nul, blank = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return _numeric_collator.sort_key(obj.replace(nul, blank))
|
||||||
|
|
||||||
|
def icu_change_case(upper, locale, obj):
    '''Upper- or lower-case *obj* with ICU.

    :param upper: Truthy to use ``_icu.upper``, falsy to use ``_icu.lower``.
    :param locale: Passed through as the first argument of the ICU function.
    :param obj: The string to convert.

    If the ICU function raises TypeError, NUL characters are removed from
    *obj* and the call is retried once.
    '''
    if upper:
        transform = _icu.upper
    else:
        transform = _icu.lower
    try:
        return transform(locale, obj)
    except TypeError:
        nul, blank = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return transform(locale, obj.replace(nul, blank))
|
||||||
|
|
||||||
|
def py_find(pattern, source):
    '''Locate *pattern* in *source* with plain ``find()``.

    Returns ``(position, len(pattern))`` for a match and ``(-1, -1)``
    otherwise.
    '''
    idx = source.find(pattern)
    return (idx, len(pattern)) if idx > -1 else (-1, -1)
|
||||||
|
|
||||||
|
def character_name(string):
    '''Return the Unicode character name for *string*, or None.

    The ICU plugin is asked first. When it has no ``character_name``
    attribute (AttributeError), ``unicodedata.name()`` is applied to the
    first character of *string* instead. None is returned when no name is
    found or the lookup fails.
    '''
    try:
        try:
            return _icu.character_name(unicode(string)) or None
        except AttributeError:
            import unicodedata
            return unicodedata.name(unicode(string)[0], None)
    except (TypeError, ValueError, KeyError, IndexError):
        # IndexError: empty input on the unicodedata fallback path
        return None
|
||||||
|
|
||||||
|
def character_name_from_code(code):
    '''Return the Unicode name of the code point *code*, or ``''``.

    The ICU plugin is asked first; when it has no
    ``character_name_from_code`` attribute (AttributeError) the name is
    looked up with ``unicodedata.name()``. Lookup failures yield ``''``.
    '''
    try:
        try:
            icu_name = _icu.character_name_from_code(code)
        except AttributeError:
            import unicodedata
            return unicodedata.name(py_safe_chr(code), '')
        return icu_name or ''
    except (TypeError, ValueError, KeyError):
        return ''
|
||||||
|
|
||||||
|
def _narrow_safe_chr(i):
    '''Return code point *i* as text, as a surrogate pair where needed.

    Narrow builds of python cannot represent code points > 0xffff as a
    single character, so we need our own implementation of unichr that
    returns them as a surrogate pair.
    '''
    # %08x rather than hex(i)[2:]: hex() of a Python 2 long ends in 'L',
    # which would corrupt the escape sequence.
    return (b'\\U%08x' % i).decode('unicode-escape')


# py_safe_chr(i) converts an integer code point to text on any build
if sys.maxunicode >= 0x10ffff:
    try:
        py_safe_chr = unichr
    except NameError:
        # No unichr builtin available
        py_safe_chr = chr
else:
    py_safe_chr = _narrow_safe_chr
|
||||||
|
|
||||||
|
def safe_chr(code):
    '''Convert the code point *code* to text.

    Uses ``_icu.chr`` and falls back to ``py_safe_chr`` when that raises
    AttributeError (for example because the plugin is not loaded).
    '''
    try:
        char = _icu.chr(code)
    except AttributeError:
        char = py_safe_chr(code)
    return char
|
||||||
|
|
||||||
|
def normalize(text, mode='NFC'):
    '''Return *text* converted to unicode and normalized according to *mode*.

    ICU is tried first; ``unicodedata.normalize()`` is used instead when the
    ICU call raises AttributeError or *mode* is not a key of ``_nmodes``.
    '''
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    try:
        normalized = _icu.normalize(_nmodes[mode], unicode(text))
    except (AttributeError, KeyError):
        import unicodedata
        normalized = unicodedata.normalize(mode, unicode(text))
    return normalized
|
||||||
|
|
||||||
|
def icu_find(collator, pattern, source):
    '''Return ``collator.find(pattern, source)``.

    If that raises TypeError the call is repeated with both arguments
    converted to unicode.
    '''
    try:
        found = collator.find(pattern, source)
    except TypeError:
        found = collator.find(unicode(pattern), unicode(source))
    return found
|
||||||
|
|
||||||
|
def icu_startswith(collator, a, b):
    '''Return ``collator.startswith(a, b)``.

    If that raises TypeError the call is repeated with both arguments
    converted to unicode.
    '''
    try:
        matched = collator.startswith(a, b)
    except TypeError:
        matched = collator.startswith(unicode(a), unicode(b))
    return matched
|
||||||
|
|
||||||
|
def py_case_sensitive_sort_key(obj):
    '''Pure python case sensitive sort key: *obj* itself, or ``_none`` if falsy.'''
    return obj or _none
|
||||||
|
|
||||||
|
def icu_case_sensitive_sort_key(collator, obj):
    '''Return ``collator.sort_key(obj)``, or ``_none2`` when *obj* is falsy.'''
    return collator.sort_key(obj) if obj else _none2
|
||||||
|
|
||||||
|
def icu_strcmp(collator, a, b):
    '''Compare *a* and *b* with *collator* after lower-casing both.'''
    folded_a = lower(a)
    folded_b = lower(b)
    return collator.strcmp(folded_a, folded_b)
|
||||||
|
|
||||||
|
def py_strcmp(a, b):
    '''Case insensitive, pure python comparison of two strings.

    Returns -1, 0 or 1 depending on whether the lower-cased *a* sorts
    before, equal to or after the lower-cased *b*.
    '''
    x, y = a.lower(), b.lower()
    # Same result as cmp(x, y), without relying on a builtin that only
    # exists on Python 2.
    return (x > y) - (x < y)
|
||||||
|
|
||||||
|
def icu_case_sensitive_strcmp(collator, a, b):
    '''Compare *a* and *b* with *collator*, without any case folding.'''
    compare = collator.strcmp
    return compare(a, b)
|
||||||
|
|
||||||
|
def icu_capitalize(s):
    '''Lower-case *s*, then upper-case its first character.

    An empty result of the lower-casing step is returned unchanged.
    '''
    lowered = lower(s)
    if not lowered:
        return lowered
    first = lowered[0]
    # count=1: only the leading occurrence of the first character changes
    return lowered.replace(first, upper(first), 1)
|
||||||
|
|
||||||
|
# Cache of contractions, keyed by collator object
_cmap = {}


def icu_contractions(collator):
    '''Return the contractions of *collator*, cached per collator object.

    The result is a frozenset of the truthy entries reported by
    ``collator.contractions()``, or an empty dict when nothing is reported.
    '''
    cached = _cmap.get(collator)
    if cached is not None:
        return cached
    reported = collator.contractions()
    cached = frozenset(filter(None, reported)) if reported else {}
    _cmap[collator] = cached
    return cached
|
||||||
|
|
||||||
|
def icu_collation_order(collator, a):
    '''Return ``collator.collation_order(a)``.

    If that raises TypeError the call is repeated with *a* converted to
    unicode.
    '''
    try:
        order = collator.collation_order(a)
    except TypeError:
        order = collator.collation_order(unicode(a))
    return order
|
||||||
|
|
||||||
|
# Initialise the module state eagerly at import time
load_icu()
load_collator()
# True when either the ICU plugin or the default collator is unavailable
_icu_not_ok = _icu is None or _collator is None
icu_unicode_version = getattr(_icu, 'unicode_version', None)
# Normalization mode name -> the plugin's UNORM_* constant (None when absent)
_nmodes = {m: getattr(_icu, 'UNORM_' + m, None)
           for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}

# Tell ICU to use utf-8 when python reports no encoding or plain ascii.
# This is deliberately best effort: _icu may be None here, in which case the
# attribute access fails and is ignored. Only Exception is caught so that
# SystemExit and KeyboardInterrupt still propagate, and no temporary names
# are left behind in the module namespace.
try:
    if (sys.getdefaultencoding() or 'ascii').lower() == 'ascii':
        _icu.set_default_encoding('utf-8')
except Exception:
    pass

try:
    if (sys.getfilesystemencoding() or 'ascii').lower() == 'ascii':
        _icu.set_filesystem_encoding('utf-8')
except Exception:
    pass
|
||||||
|
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
################# The string functions ########################################
|
||||||
|
|
||||||
|
# Each public name is bound once, at import time, to either the pure python
# implementation (ICU unavailable) or the ICU implementation with the default
# collator / current locale already filled in.
sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)

strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)

case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
    partial(icu_case_sensitive_sort_key, _collator)

# The collator is bound here like for every other ICU backed function, so
# that the call signature is (a, b) for both implementations.
case_sensitive_strcmp = cmp if _icu_not_ok else \
    partial(icu_case_sensitive_strcmp, _collator)

upper = (lambda s: s.upper()) if _icu_not_ok else \
    partial(icu_change_case, True, get_locale())

lower = (lambda s: s.lower()) if _icu_not_ok else \
    partial(icu_change_case, False, get_locale())

title_case = (lambda s: s.title()) if _icu_not_ok else \
    partial(_icu.title, get_locale())

capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
    (lambda s: icu_capitalize(s))

find = (py_find if _icu_not_ok else partial(icu_find, _collator))

contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
    _collator)))
|
||||||
|
|
||||||
|
def primary_strcmp(a, b):
    '''strcmp that ignores case and accents on letters.

    Without ICU both strings are passed through ``ascii_text()`` and
    compared with ``py_strcmp()``.
    '''
    if not _icu_not_ok:
        try:
            outcome = _primary_collator.strcmp(a, b)
        except AttributeError:
            # The primary collator has not been created yet
            outcome = primary_collator().strcmp(a, b)
        return outcome
    from calibre.utils.filenames import ascii_text
    return py_strcmp(ascii_text(a), ascii_text(b))
|
||||||
|
|
||||||
|
def primary_find(pat, src):
    '''find that ignores case and accents on letters.

    Without ICU both strings are passed through ``ascii_text()`` and
    searched with ``py_find()``.
    '''
    if not _icu_not_ok:
        return primary_icu_find(pat, src)
    from calibre.utils.filenames import ascii_text
    return py_find(ascii_text(pat), ascii_text(src))
|
||||||
|
|
||||||
|
def primary_icu_find(pat, src):
    '''Find *pat* in *src* using the primary collator via ``icu_find()``.'''
    try:
        location = icu_find(_primary_collator, pat, src)
    except AttributeError:
        # The primary collator has not been created yet
        location = icu_find(primary_collator(), pat, src)
    return location
|
||||||
|
|
||||||
|
def primary_sort_key(val):
    '''A sort key that ignores case and diacritics.

    Without ICU the key is the lower-cased ``ascii_text()`` of *val*.
    '''
    if not _icu_not_ok:
        try:
            key = _primary_collator.sort_key(val)
        except AttributeError:
            # The primary collator has not been created yet
            key = primary_collator().sort_key(val)
        return key
    from calibre.utils.filenames import ascii_text
    return ascii_text(val).lower()
|
||||||
|
|
||||||
|
def primary_startswith(a, b):
    '''Return whether *a* starts with *b* using the primary collator.

    Without ICU the lower-cased ``ascii_text()`` forms of both strings are
    compared with ``str.startswith``.
    '''
    if not _icu_not_ok:
        try:
            matched = icu_startswith(_primary_collator, a, b)
        except AttributeError:
            # The primary collator has not been created yet
            matched = icu_startswith(primary_collator(), a, b)
        return matched
    from calibre.utils.filenames import ascii_text
    haystack = ascii_text(a).lower()
    prefix = ascii_text(b).lower()
    return haystack.startswith(prefix)
|
||||||
|
|
||||||
|
def collation_order(a):
    '''Return the collation order of *a* as reported by the sort collator.

    Without ICU the result is ``(ord(a[0]), 1)``, or ``(0, 0)`` for falsy
    input.
    '''
    if _icu_not_ok:
        if a:
            return (ord(a[0]), 1)
        return (0, 0)
    try:
        order = icu_collation_order(_sort_collator, a)
    except AttributeError:
        # The sort collator has not been created yet
        order = icu_collation_order(sort_collator(), a)
    return order
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
def test():  # {{{
    '''Ad hoc self test of this module, run against the ICU plugin.

    Problems are reported on stdout. Most failed checks make the function
    return early; a wrong character name raises ValueError instead.
    '''
    global _primary_collator
    from calibre import prints
    # Data {{{
    german = '''
    Sonntag
    Montag
    Dienstag
    Januar
    Februar
    März
    Fuße
    Fluße
    Flusse
    flusse
    fluße
    flüße
    flüsse
    '''
    german_good = '''
    Dienstag
    Februar
    flusse
    Flusse
    fluße
    Fluße
    flüsse
    flüße
    Fuße
    Januar
    März
    Montag
    Sonntag'''
    french = '''
    dimanche
    lundi
    mardi
    janvier
    février
    mars
    déjà
    Meme
    deja
    même
    dejà
    bpef
    bœg
    Boef
    Mémé
    bœf
    boef
    bnef
    pêche
    pèché
    pêché
    pêche
    pêché'''
    french_good = '''
    bnef
    boef
    Boef
    bœf
    bœg
    bpef
    deja
    dejà
    déjà
    dimanche
    février
    janvier
    lundi
    mardi
    mars
    Meme
    Mémé
    même
    pèché
    pêche
    pêche
    pêché
    pêché'''
    # }}}

    def create(l):
        # Decode a block of test data into a list of its non-blank lines
        l = l.decode('utf-8').splitlines()
        return [x.strip() for x in l if x.strip()]

    def test_strcmp(entries):
        # strcmp() must agree with comparing the sort keys of its arguments
        for x in entries:
            for y in entries:
                if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
                    print('strcmp failed for %r, %r'%(x, y))

    german = create(german)
    c = _icu.Collator('de')
    c.numeric = True
    gs = list(sorted(german, key=c.sort_key))
    if gs != create(german_good):
        print('German sorting failed')
        return
    print('')
    french = create(french)
    c = _icu.Collator('fr')
    c.numeric = True
    fs = list(sorted(french, key=c.sort_key))
    if fs != create(french_good):
        print('French sorting failed (note that French fails with icu < 4.6)')
        return
    test_strcmp(german + french)

    print('\nTesting case transforms in current locale')
    from calibre.utils.titlecase import titlecase
    for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
        # Items are joined by hand so the output does not depend on whether
        # print is a statement or a function.
        print(' '.join(('Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8'))))
        print(' '.join(('Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8'))))
        print(' '.join(('Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8'))))
        print(' '.join(('Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8'))))
        print('')

    print('\nTesting primary collation')
    for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
            u'Štepánek':u'ŠtepaneK'}.iteritems():
        if primary_strcmp(k, v) != 0:
            prints('primary_strcmp() failed with %s != %s'%(k, v))
            return
        if primary_find(v, u' '+k)[0] != 1:
            prints('primary_find() failed with %s not in %s'%(v, k))
            return

    n = character_name(safe_chr(0x1f431))
    if n != u'CAT FACE':
        # Both values must be passed as one tuple to the % operator
        raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % (n, u'CAT FACE'))

    orig = _primary_collator
    _primary_collator = _icu.Collator('es')
    try:
        if primary_strcmp(u'peña', u'pena') == 0:
            print('Primary collation in Spanish locale failed')
            return
    finally:
        # Always put the real collator back, also on the early return
        _primary_collator = orig

    print('\nTesting contractions')
    c = _icu.Collator('cs')
    if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
        u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
        u'S\u030c', u'R\u030c']):
        print('Contractions for the Czech language failed')
        return

    print('\nTesting startswith')
    p = primary_startswith
    if (not p('asd', 'asd') or not p('asd', 'A') or
            not p('x', '')):
        print('startswith() failed')
        return

    print('\nTesting collation_order()')
    for group in [
        ('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
        ('calibre', 'Charon', 'Collins'),
        ('01', '1'),
        ('1', '11', '13'),
    ]:
        # All members of a group are expected to report the same order
        last = None
        for x in group:
            val = icu_collation_order(sort_collator(), x)
            if val[1] != 1:
                prints('collation_order() returned incorrect length for', x)
            if last is None:
                last = val
            else:
                if val != last:
                    prints('collation_order() returned incorrect value for', x)
            last = val
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def test_roundtrip():
    '''Check that ``_icu.roundtrip()`` returns each sample string unchanged.

    Raises ValueError for the first sample that comes back different.
    '''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for sample in samples:
        echoed = _icu.roundtrip(sample)
        if echoed != sample:
            raise ValueError(u'Roundtripping failed: %r != %r' % (sample, echoed))
|
||||||
|
|
||||||
|
def test_normalize_performance():
    '''Time normalize() against unicodedata.normalize() on ``t.txt``.

    Does nothing when there is no ``t.txt`` in the current directory.
    Otherwise the file is decoded as utf-8, its length is printed, and the
    time taken by 100 runs of each implementation is printed.
    '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Close the file deterministically instead of leaking the handle
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print(len(raw))
    import time
    import unicodedata
    st = time.time()
    count = 100
    for i in xrange(count):
        normalize(raw)
    # A single formatted string: a parenthesised pair would be printed as a
    # tuple by the print statement.
    print('ICU time: %s' % (time.time() - st))
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print('py time: %s' % (time.time() - st))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the self tests, in this order, when executed as a script
    for check in (test_roundtrip, test_normalize_performance, test):
        check()
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user