mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get rid of the backwards-compat code for people running from source who have not updated their binary calibre builds, as 1.28 has been out for a while
This commit is contained in:
parent
7256c9bf4e
commit
828406fdc2
@ -251,16 +251,6 @@ def contractions(col=None):
|
|||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
if not hasattr(_icu, 'change_case'):
|
|
||||||
print ('You are running from source with an outdated calibre binary install. You'
|
|
||||||
' should update the main calibre binary to at least version 1.28.')
|
|
||||||
# Don't break calibre for people running from source until the
|
|
||||||
# next binary is available with the updated icu module
|
|
||||||
from calibre.utils.icu_old import * # noqa
|
|
||||||
|
|
||||||
def primary_contains(pat, src):
    '''
    Return True if primary_find() locates pat somewhere in src.

    primary_find() reports a position of -1 when there is no match.
    '''
    position = primary_find(pat, src)[0]
    return position != -1
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from calibre.utils.icu_test import run
|
from calibre.utils.icu_test import run
|
||||||
run(verbosity=4)
|
run(verbosity=4)
|
||||||
|
@ -1,541 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
# Setup code {{{
|
|
||||||
import sys
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
from calibre.constants import plugins
|
|
||||||
from calibre.utils.config_base import tweaks
|
|
||||||
|
|
||||||
# Lazily initialised module state: the ICU plugin module and the collators
# derived from it. Everything starts out as None and is filled in by the
# load_*() / *_collator() functions of this module.
_icu = None
_collator = None
_primary_collator = None
_sort_collator = None
_numeric_collator = None
# Locale name used for collation, resolved on the first call to get_locale()
_locale = None

# Sort keys returned for empty input: _none by the pure python fallbacks,
# _none2 by the ICU backed functions
_none = u''
_none2 = b''
|
|
||||||
|
|
||||||
def get_locale():
    '''
    Return the locale name used for collation, resolving it on first use.

    The ``locale_for_sorting`` tweak wins when it is set to a truthy value,
    otherwise the result of get_lang() is used. The answer is cached in the
    module level ``_locale``.
    '''
    global _locale
    if _locale is not None:
        return _locale
    from calibre.utils.localization import get_lang
    override = tweaks['locale_for_sorting']
    _locale = override if override else get_lang()
    return _locale
|
|
||||||
|
|
||||||
def load_icu():
|
|
||||||
global _icu
|
|
||||||
if _icu is None:
|
|
||||||
_icu = plugins['icu'][0]
|
|
||||||
if _icu is None:
|
|
||||||
print 'Loading ICU failed with: ', plugins['icu'][1]
|
|
||||||
else:
|
|
||||||
if not getattr(_icu, 'ok', False):
|
|
||||||
print 'icu not ok'
|
|
||||||
_icu = None
|
|
||||||
return _icu
|
|
||||||
|
|
||||||
def load_collator():
    '''
    The default collator for most locales takes both case and accented letters into account

    Creates the collator for get_locale() on first use and caches it in
    ``_collator``. Returns None when the ICU module could not be loaded.
    '''
    global _collator
    if _collator is not None:
        return _collator
    icu_module = load_icu()
    if icu_module is None:
        return _collator
    _collator = icu_module.Collator(get_locale())
    return _collator
|
|
||||||
|
|
||||||
def primary_collator():
    '''
    Ignores case differences and accented characters

    Returns a clone of the default collator with its strength set to
    UCOL_PRIMARY, created on first use and cached in ``_primary_collator``.
    '''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    _primary_collator = _collator.clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
|
|
||||||
|
|
||||||
def sort_collator():
    '''
    Ignores case differences and recognizes numbers in strings

    Returns a clone of the default collator with UCOL_SECONDARY strength,
    created on first use and cached in ``_sort_collator``. Numeric collation
    is only switched on when the ``numeric_collation`` tweak is truthy.
    '''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = _collator.clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    if not tweaks['numeric_collation']:
        return _sort_collator
    try:
        _sort_collator.numeric = True
    except AttributeError:
        # The collator object does not accept the numeric attribute, carry
        # on without numeric collation
        pass
    return _sort_collator
|
|
||||||
|
|
||||||
def py_sort_key(obj):
    '''
    Pure python sort key: the lower-cased string, or the empty unicode string
    for any falsy input.
    '''
    return obj.lower() if obj else _none
|
|
||||||
|
|
||||||
def icu_sort_key(collator, obj):
    '''
    Return the ICU sort key for obj, or empty bytes for falsy input.

    The ``collator`` argument is accepted but not consulted: the key always
    comes from the shared sort collator, which is created on demand. If the
    collator rejects obj with a TypeError, NUL characters are removed and the
    key is computed again.
    '''
    if not obj:
        return _none2
    try:
        try:
            key = _sort_collator.sort_key(obj)
        except AttributeError:
            # _sort_collator is still None, create it now
            key = sort_collator().sort_key(obj)
    except TypeError:
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        key = _sort_collator.sort_key(obj.replace(nul, empty))
    return key
|
|
||||||
|
|
||||||
def numeric_collator():
    '''
    Return the shared collator used for natural sorting of numbers in strings.

    The collator is a clone of the default collator with UCOL_SECONDARY
    strength and its ``numeric`` attribute set to True. It is created on first
    use and cached in ``_numeric_collator``, in the same way as
    primary_collator() and sort_collator(), instead of being re-cloned on
    every call.
    '''
    global _numeric_collator
    if _numeric_collator is None:
        _numeric_collator = _collator.clone()
        _numeric_collator.strength = _icu.UCOL_SECONDARY
        _numeric_collator.numeric = True
    return _numeric_collator
|
|
||||||
|
|
||||||
def numeric_sort_key(obj):
    '''
    Uses natural sorting for numbers inside strings so something2 will sort before something10

    Falsy input yields empty bytes. If the collator rejects obj with a
    TypeError, NUL characters are removed and the key is computed again.
    '''
    if not obj:
        return _none2
    try:
        try:
            key = _numeric_collator.sort_key(obj)
        except AttributeError:
            # _numeric_collator is still None, create it now
            key = numeric_collator().sort_key(obj)
    except TypeError:
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        key = _numeric_collator.sort_key(obj.replace(nul, empty))
    return key
|
|
||||||
|
|
||||||
def icu_change_case(upper, locale, obj):
    '''
    Upper-case (upper is truthy) or lower-case obj using ICU for locale.

    If ICU rejects obj with a TypeError, NUL characters are removed from it
    and the conversion is attempted once more.
    '''
    if upper:
        convert = _icu.upper
    else:
        convert = _icu.lower
    try:
        return convert(locale, obj)
    except TypeError:
        if isinstance(obj, unicode):
            nul, empty = u'\0', u''
        else:
            nul, empty = b'\0', b''
        return convert(locale, obj.replace(nul, empty))
|
|
||||||
|
|
||||||
def py_find(pattern, source):
    '''
    Pure python substring search.

    Returns ``(position, length of pattern)`` for the first occurrence of
    pattern in source, or ``(-1, -1)`` when pattern does not occur.
    '''
    index = source.find(pattern)
    if index <= -1:
        return -1, -1
    return index, len(pattern)
|
|
||||||
|
|
||||||
def character_name(string):
    '''
    Return the unicode name of the character in string, or None.

    ICU is used when the loaded module provides character_name(); otherwise
    the name of the first character is looked up with unicodedata. Lookup
    failures yield None instead of an exception. This includes an empty
    string on the unicodedata path, which used to raise IndexError.
    '''
    try:
        try:
            return _icu.character_name(unicode(string)) or None
        except AttributeError:
            # No ICU module, or one without character_name()
            import unicodedata
            return unicodedata.name(unicode(string)[0], None)
    except (TypeError, ValueError, KeyError, IndexError):
        return None
|
|
||||||
|
|
||||||
def character_name_from_code(code):
    '''
    Return the unicode name for the code point code, or '' if there is none.

    ICU is used when the loaded module provides character_name_from_code();
    otherwise unicodedata is used. TypeError, ValueError and KeyError are
    turned into an empty name.
    '''
    try:
        try:
            name = _icu.character_name_from_code(code)
        except AttributeError:
            # No ICU module, or one without character_name_from_code()
            import unicodedata
            name = unicodedata.name(py_safe_chr(code), '')
    except (TypeError, ValueError, KeyError):
        name = ''
    return name or ''
|
|
||||||
|
|
||||||
if sys.maxunicode < 0x10ffff:
    def py_safe_chr(i):
        # Narrow builds of python cannot represent code points > 0xffff as a
        # single character, so we need our own implementation of unichr
        # that returns them as a surrogate pair
        digits = hex(i)[2:].zfill(8)
        return (b"\U%s" % digits).decode('unicode-escape')
else:
    # Wide build: the builtin handles every code point. Use unichr where it
    # exists and fall back to chr where it does not.
    try:
        py_safe_chr = unichr
    except NameError:
        py_safe_chr = chr
|
|
||||||
|
|
||||||
def safe_chr(code):
    '''
    Return the character for the code point code, using ICU's chr() when it
    is available and py_safe_chr() otherwise.
    '''
    try:
        char = _icu.chr(code)
    except AttributeError:
        char = py_safe_chr(code)
    return char
|
|
||||||
|
|
||||||
def normalize(text, mode='NFC'):
    '''
    Return text as unicode, normalized to the form named by mode.

    Falls back to unicodedata.normalize() when ICU is unavailable or mode is
    not one of the names in ``_nmodes``.
    '''
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    try:
        normalized = _icu.normalize(_nmodes[mode], unicode(text))
    except (AttributeError, KeyError):
        import unicodedata
        normalized = unicodedata.normalize(mode, unicode(text))
    return normalized
|
|
||||||
|
|
||||||
def icu_find(collator, pattern, source):
    '''
    Search for pattern in source with collator.find(). If the collator
    raises TypeError, both arguments are converted with unicode() and the
    search is repeated.
    '''
    try:
        found = collator.find(pattern, source)
    except TypeError:
        found = collator.find(unicode(pattern), unicode(source))
    return found
|
|
||||||
|
|
||||||
def icu_startswith(collator, a, b):
    '''
    Return collator.startswith(a, b). If the collator raises TypeError, both
    arguments are converted with unicode() and the call is repeated.
    '''
    try:
        result = collator.startswith(a, b)
    except TypeError:
        result = collator.startswith(unicode(a), unicode(b))
    return result
|
|
||||||
|
|
||||||
def py_case_sensitive_sort_key(obj):
    '''
    Pure python case sensitive sort key: obj itself, or the empty unicode
    string for any falsy input.
    '''
    return obj if obj else _none
|
|
||||||
|
|
||||||
def icu_case_sensitive_sort_key(collator, obj):
    '''
    Return collator.sort_key(obj), or empty bytes for any falsy obj.
    '''
    return collator.sort_key(obj) if obj else _none2
|
|
||||||
|
|
||||||
def icu_strcmp(collator, a, b):
    '''
    Compare a and b with collator.strcmp() after passing both through
    lower().
    '''
    left = lower(a)
    right = lower(b)
    return collator.strcmp(left, right)
|
|
||||||
|
|
||||||
def py_strcmp(a, b):
    '''
    Pure python comparison of the lower-cased forms of a and b, using the
    builtin cmp().
    '''
    left = a.lower()
    right = b.lower()
    return cmp(left, right)
|
|
||||||
|
|
||||||
def icu_case_sensitive_strcmp(collator, a, b):
    '''
    Compare a and b with collator.strcmp(), without changing their case
    first.
    '''
    outcome = collator.strcmp(a, b)
    return outcome
|
|
||||||
|
|
||||||
def icu_capitalize(s):
    '''
    Lower-case s with lower(), then replace the first occurrence of its
    first character with the upper() form of that character. An empty result
    of lower() is returned unchanged.
    '''
    lowered = lower(s)
    if not lowered:
        return lowered
    first = lowered[0]
    return lowered.replace(first, upper(first), 1)
|
|
||||||
|
|
||||||
# Cache of computed contractions, keyed by collator object
_cmap = {}


def icu_contractions(collator):
    '''
    Return the contractions reported by collator, with falsy entries dropped.

    The result is a frozenset, or an empty dict when the collator reports no
    contractions. It is cached per collator in ``_cmap``.
    '''
    cached = _cmap.get(collator, None)
    if cached is not None:
        return cached
    raw = collator.contractions()
    if raw:
        result = frozenset(filter(None, raw))
    else:
        result = {}
    _cmap[collator] = result
    return result
|
|
||||||
|
|
||||||
def icu_collation_order(collator, a):
    '''
    Return collator.collation_order(a). If the collator raises TypeError, a
    is converted with unicode() and the call is repeated.
    '''
    try:
        order = collator.collation_order(a)
    except TypeError:
        order = collator.collation_order(unicode(a))
    return order
|
|
||||||
|
|
||||||
# Import time initialisation: load the ICU module and the default collator,
# then derive the flags and lookup tables used by the rest of this module.
load_icu()
load_collator()
# True when the pure python fallbacks have to be used
_icu_not_ok = _icu is None or _collator is None
icu_unicode_version = getattr(_icu, 'unicode_version', None)
# Normalization mode name -> ICU UNORM_* constant (None when not available)
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}

# Best effort: when python's default encoding is missing or ascii, ask ICU to
# use utf-8 instead. Failures (for example ICU not being loaded) are ignored
# on purpose, but KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    senc = sys.getdefaultencoding()
    if not senc or senc.lower() == 'ascii':
        _icu.set_default_encoding('utf-8')
    del senc
except Exception:
    pass

# Same best effort treatment for the filesystem encoding
try:
    fenc = sys.getfilesystemencoding()
    if not fenc or fenc.lower() == 'ascii':
        _icu.set_filesystem_encoding('utf-8')
    del fenc
except Exception:
    pass
|
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
################# The string functions ########################################
|
|
||||||
|
|
||||||
# Each public name below is bound once, at import time, to the pure python
# fallback when ICU is unusable and to the ICU backed implementation
# otherwise.

sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)

strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)

case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
        partial(icu_case_sensitive_sort_key, _collator)

# icu_case_sensitive_strcmp() takes the collator as its first argument, so it
# has to be bound here just like strcmp above; otherwise the two argument
# call that works with the cmp fallback fails when ICU is available.
case_sensitive_strcmp = cmp if _icu_not_ok else \
        partial(icu_case_sensitive_strcmp, _collator)

upper = (lambda s: s.upper()) if _icu_not_ok else \
    partial(icu_change_case, True, get_locale())

lower = (lambda s: s.lower()) if _icu_not_ok else \
    partial(icu_change_case, False, get_locale())

title_case = (lambda s: s.title()) if _icu_not_ok else \
    partial(_icu.title, get_locale())

capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
    (lambda s: icu_capitalize(s))

find = (py_find if _icu_not_ok else partial(icu_find, _collator))

contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
    _collator)))
|
|
||||||
|
|
||||||
def primary_strcmp(a, b):
    '''
    strcmp that ignores case and accents on letters

    Without ICU both strings are passed through ascii_text() and compared
    with py_strcmp().
    '''
    if not _icu_not_ok:
        try:
            result = _primary_collator.strcmp(a, b)
        except AttributeError:
            # _primary_collator is still None, create it now
            result = primary_collator().strcmp(a, b)
        return result
    from calibre.utils.filenames import ascii_text
    return py_strcmp(ascii_text(a), ascii_text(b))
|
|
||||||
|
|
||||||
def primary_find(pat, src):
    '''
    find that ignores case and accents on letters

    Without ICU both strings are passed through ascii_text() and searched
    with py_find().
    '''
    if not _icu_not_ok:
        return primary_icu_find(pat, src)
    from calibre.utils.filenames import ascii_text
    return py_find(ascii_text(pat), ascii_text(src))
|
|
||||||
|
|
||||||
def primary_icu_find(pat, src):
    '''
    Run icu_find() for pat in src with the primary collator, creating that
    collator first if it does not exist yet.
    '''
    try:
        found = icu_find(_primary_collator, pat, src)
    except AttributeError:
        # _primary_collator is still None, create it now
        found = icu_find(primary_collator(), pat, src)
    return found
|
|
||||||
|
|
||||||
def primary_sort_key(val):
    '''
    A sort key that ignores case and diacritics

    Without ICU the key is the lower-cased ascii_text() form of val.
    '''
    if not _icu_not_ok:
        try:
            key = _primary_collator.sort_key(val)
        except AttributeError:
            # _primary_collator is still None, create it now
            key = primary_collator().sort_key(val)
        return key
    from calibre.utils.filenames import ascii_text
    return ascii_text(val).lower()
|
|
||||||
|
|
||||||
def primary_startswith(a, b):
    '''
    Return True if a starts with b, using the primary collator.

    Without ICU the lower-cased ascii_text() forms of both strings are
    compared with str.startswith().
    '''
    if not _icu_not_ok:
        try:
            result = icu_startswith(_primary_collator, a, b)
        except AttributeError:
            # _primary_collator is still None, create it now
            result = icu_startswith(primary_collator(), a, b)
        return result
    from calibre.utils.filenames import ascii_text
    haystack = ascii_text(a).lower()
    prefix = ascii_text(b).lower()
    return haystack.startswith(prefix)
|
|
||||||
|
|
||||||
def collation_order(a):
    '''
    Return the collation order of a as computed by the sort collator.

    Without ICU the result is ``(ord(a[0]), 1)``, or ``(0, 0)`` for an empty
    a.
    '''
    if _icu_not_ok:
        if a:
            return (ord(a[0]), 1)
        return (0, 0)
    try:
        order = icu_collation_order(_sort_collator, a)
    except AttributeError:
        # _sort_collator is still None, create it now
        order = icu_collation_order(sort_collator(), a)
    return order
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
|
|
||||||
def test(): # {{{
|
|
||||||
from calibre import prints
|
|
||||||
# Data {{{
|
|
||||||
german = '''
|
|
||||||
Sonntag
|
|
||||||
Montag
|
|
||||||
Dienstag
|
|
||||||
Januar
|
|
||||||
Februar
|
|
||||||
März
|
|
||||||
Fuße
|
|
||||||
Fluße
|
|
||||||
Flusse
|
|
||||||
flusse
|
|
||||||
fluße
|
|
||||||
flüße
|
|
||||||
flüsse
|
|
||||||
'''
|
|
||||||
german_good = '''
|
|
||||||
Dienstag
|
|
||||||
Februar
|
|
||||||
flusse
|
|
||||||
Flusse
|
|
||||||
fluße
|
|
||||||
Fluße
|
|
||||||
flüsse
|
|
||||||
flüße
|
|
||||||
Fuße
|
|
||||||
Januar
|
|
||||||
März
|
|
||||||
Montag
|
|
||||||
Sonntag'''
|
|
||||||
french = '''
|
|
||||||
dimanche
|
|
||||||
lundi
|
|
||||||
mardi
|
|
||||||
janvier
|
|
||||||
février
|
|
||||||
mars
|
|
||||||
déjà
|
|
||||||
Meme
|
|
||||||
deja
|
|
||||||
même
|
|
||||||
dejà
|
|
||||||
bpef
|
|
||||||
bœg
|
|
||||||
Boef
|
|
||||||
Mémé
|
|
||||||
bœf
|
|
||||||
boef
|
|
||||||
bnef
|
|
||||||
pêche
|
|
||||||
pèché
|
|
||||||
pêché
|
|
||||||
pêche
|
|
||||||
pêché'''
|
|
||||||
french_good = '''
|
|
||||||
bnef
|
|
||||||
boef
|
|
||||||
Boef
|
|
||||||
bœf
|
|
||||||
bœg
|
|
||||||
bpef
|
|
||||||
deja
|
|
||||||
dejà
|
|
||||||
déjà
|
|
||||||
dimanche
|
|
||||||
février
|
|
||||||
janvier
|
|
||||||
lundi
|
|
||||||
mardi
|
|
||||||
mars
|
|
||||||
Meme
|
|
||||||
Mémé
|
|
||||||
même
|
|
||||||
pèché
|
|
||||||
pêche
|
|
||||||
pêche
|
|
||||||
pêché
|
|
||||||
pêché'''
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
def create(l):
|
|
||||||
l = l.decode('utf-8').splitlines()
|
|
||||||
return [x.strip() for x in l if x.strip()]
|
|
||||||
|
|
||||||
def test_strcmp(entries):
|
|
||||||
for x in entries:
|
|
||||||
for y in entries:
|
|
||||||
if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
|
|
||||||
print 'strcmp failed for %r, %r'%(x, y)
|
|
||||||
|
|
||||||
german = create(german)
|
|
||||||
c = _icu.Collator('de')
|
|
||||||
c.numeric = True
|
|
||||||
gs = list(sorted(german, key=c.sort_key))
|
|
||||||
if gs != create(german_good):
|
|
||||||
print 'German sorting failed'
|
|
||||||
return
|
|
||||||
print
|
|
||||||
french = create(french)
|
|
||||||
c = _icu.Collator('fr')
|
|
||||||
c.numeric = True
|
|
||||||
fs = list(sorted(french, key=c.sort_key))
|
|
||||||
if fs != create(french_good):
|
|
||||||
print 'French sorting failed (note that French fails with icu < 4.6)'
|
|
||||||
return
|
|
||||||
test_strcmp(german + french)
|
|
||||||
|
|
||||||
print '\nTesting case transforms in current locale'
|
|
||||||
from calibre.utils.titlecase import titlecase
|
|
||||||
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
|
|
||||||
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
|
|
||||||
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
|
|
||||||
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
|
|
||||||
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
|
|
||||||
print
|
|
||||||
|
|
||||||
print '\nTesting primary collation'
|
|
||||||
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
|
|
||||||
u'Štepánek':u'ŠtepaneK'}.iteritems():
|
|
||||||
if primary_strcmp(k, v) != 0:
|
|
||||||
prints('primary_strcmp() failed with %s != %s'%(k, v))
|
|
||||||
return
|
|
||||||
if primary_find(v, u' '+k)[0] != 1:
|
|
||||||
prints('primary_find() failed with %s not in %s'%(v, k))
|
|
||||||
return
|
|
||||||
|
|
||||||
n = character_name(safe_chr(0x1f431))
|
|
||||||
if n != u'CAT FACE':
|
|
||||||
raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
|
|
||||||
|
|
||||||
global _primary_collator
|
|
||||||
orig = _primary_collator
|
|
||||||
_primary_collator = _icu.Collator('es')
|
|
||||||
if primary_strcmp(u'peña', u'pena') == 0:
|
|
||||||
print 'Primary collation in Spanish locale failed'
|
|
||||||
return
|
|
||||||
_primary_collator = orig
|
|
||||||
|
|
||||||
print '\nTesting contractions'
|
|
||||||
c = _icu.Collator('cs')
|
|
||||||
if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
|
|
||||||
u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
|
|
||||||
u'S\u030c', u'R\u030c']):
|
|
||||||
print 'Contractions for the Czech language failed'
|
|
||||||
return
|
|
||||||
|
|
||||||
print '\nTesting startswith'
|
|
||||||
p = primary_startswith
|
|
||||||
if (not p('asd', 'asd') or not p('asd', 'A') or
|
|
||||||
not p('x', '')):
|
|
||||||
print 'startswith() failed'
|
|
||||||
return
|
|
||||||
|
|
||||||
print '\nTesting collation_order()'
|
|
||||||
for group in [
|
|
||||||
('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
|
|
||||||
('calibre', 'Charon', 'Collins'),
|
|
||||||
('01', '1'),
|
|
||||||
('1', '11', '13'),
|
|
||||||
]:
|
|
||||||
last = None
|
|
||||||
for x in group:
|
|
||||||
val = icu_collation_order(sort_collator(), x)
|
|
||||||
if val[1] != 1:
|
|
||||||
prints('collation_order() returned incorrect length for', x)
|
|
||||||
if last is None:
|
|
||||||
last = val
|
|
||||||
else:
|
|
||||||
if val != last:
|
|
||||||
prints('collation_order() returned incorrect value for', x)
|
|
||||||
last = val
|
|
||||||
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
def test_roundtrip():
    '''
    Check that _icu.roundtrip() returns each sample string unchanged; raise
    ValueError for the first one that differs.
    '''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for original in samples:
        result = _icu.roundtrip(original)
        if result == original:
            continue
        raise ValueError(u'Roundtripping failed: %r != %r' % (original, result))
|
|
||||||
|
|
||||||
def test_normalize_performance():
    '''
    Time 100 runs of normalize() against 100 runs of unicodedata.normalize()
    on the utf-8 contents of ``t.txt`` in the current directory, printing
    both timings. Does nothing when that file does not exist.
    '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Close the file as soon as it has been read instead of leaking the handle
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time, unicodedata
    st = time.time()
    count = 100
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Run the manual checks in order: roundtrip, normalize timing, then the
    # full smoke test
    for check in (test_roundtrip, test_normalize_performance, test):
        check()
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user