mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow merging of icu branch into trunk by falling back to the old icu module if the old binary plugin is detected.
This commit is contained in:
parent
b76cc3e9ab
commit
1f2aa8a55b
@ -247,6 +247,16 @@ def contractions(col=None):
|
||||
|
||||
################################################################################
|
||||
|
||||
if not hasattr(_icu, 'change_case'):
    # The loaded binary plugin does not provide change_case, i.e. it predates
    # the updated icu module. Don't break calibre for people running from
    # source until the next binary is available with the updated icu module:
    # warn, then replace the names in this module with the old implementation.
    print ('You are running from source with an outdated calibre binary install. You'
           ' should update the main calibre binary to at least version 1.28.')
    from calibre.utils.icu_old import *  # noqa
|
||||
|
||||
def primary_contains(pat, src):
    'Return True if primary_find() locates pat somewhere in src'
    position = primary_find(pat, src)[0]
    # primary_find() reports a position of -1 when there is no match
    return position != -1
|
||||
|
||||
if __name__ == '__main__':
    # Running this module as a script runs the ICU test suite verbosely
    from calibre.utils import icu_test
    icu_test.run(verbosity=4)
|
||||
|
541
src/calibre/utils/icu_old.py
Normal file
541
src/calibre/utils/icu_old.py
Normal file
@ -0,0 +1,541 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
# Setup code {{{
|
||||
import sys
|
||||
from functools import partial
|
||||
|
||||
from calibre.constants import plugins
|
||||
from calibre.utils.config_base import tweaks
|
||||
|
||||
# Lazily created singletons. Each stays None until the matching loader or
# constructor function below has run successfully.
_icu = None
_collator = None
_primary_collator = None
_sort_collator = None
_numeric_collator = None

# Cached result of get_locale()
_locale = None

# Sort keys returned for empty/None input: unicode for the pure-python
# implementations, bytes for the ICU backed ones.
_none = u''
_none2 = b''
|
||||
|
||||
def get_locale():
    '''
    Return the locale name used for collation, computing it on first use.

    The ``locale_for_sorting`` tweak is used when it is set to a true value,
    otherwise the value returned by get_lang() is used. The result is cached
    in the module level ``_locale``.
    '''
    global _locale
    if _locale is not None:
        return _locale
    # Imported lazily, only when the locale actually has to be computed
    from calibre.utils.localization import get_lang
    _locale = tweaks['locale_for_sorting'] or get_lang()
    return _locale
|
||||
|
||||
def load_icu():
|
||||
global _icu
|
||||
if _icu is None:
|
||||
_icu = plugins['icu'][0]
|
||||
if _icu is None:
|
||||
print 'Loading ICU failed with: ', plugins['icu'][1]
|
||||
else:
|
||||
if not getattr(_icu, 'ok', False):
|
||||
print 'icu not ok'
|
||||
_icu = None
|
||||
return _icu
|
||||
|
||||
def load_collator():
    '''
    The default collator for most locales takes both case and accented
    letters into account.

    Creates the collator for get_locale() on first use and caches it in
    ``_collator``. Returns None if the ICU module is not available.
    '''
    global _collator
    if _collator is not None:
        return _collator
    icu_module = load_icu()
    if icu_module is not None:
        _collator = icu_module.Collator(get_locale())
    return _collator
|
||||
|
||||
def primary_collator():
    '''
    Ignores case differences and accented characters.

    Lazily clones the default collator, sets its strength to UCOL_PRIMARY and
    caches the clone in ``_primary_collator``.
    '''
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    # The global is assigned before it is configured, exactly as before
    _primary_collator = _collator.clone()
    _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator
|
||||
|
||||
def sort_collator():
    '''
    Ignores case differences and recognizes numbers in strings.

    Lazily clones the default collator with strength UCOL_SECONDARY and
    caches it in ``_sort_collator``. Numeric collation is switched on only
    when the ``numeric_collation`` tweak is true.
    '''
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    _sort_collator = _collator.clone()
    _sort_collator.strength = _icu.UCOL_SECONDARY
    if tweaks['numeric_collation']:
        try:
            _sort_collator.numeric = True
        except AttributeError:
            # Setting the attribute failed; carry on without numeric sorting
            pass
    return _sort_collator
|
||||
|
||||
def py_sort_key(obj):
    'Pure python sort key: the lower cased string, or the empty key for a false value'
    return obj.lower() if obj else _none
|
||||
|
||||
def icu_sort_key(collator, obj):
    '''
    Return the sort key of obj as computed by the sort collator.

    Note that the ``collator`` argument is not used: keys always come from
    the module level ``_sort_collator``, which is created on demand. A false
    obj yields the empty key ``_none2``.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _sort_collator.sort_key(obj)
        except AttributeError:
            # _sort_collator has not been created yet
            return sort_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters removed from the input
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        obj = obj.replace(nul, empty)
        return _sort_collator.sort_key(obj)
|
||||
|
||||
def numeric_collator():
    '''
    Build the collator used by numeric_sort_key() and store it in
    ``_numeric_collator``.

    Unlike the other collator constructors in this module, this one builds a
    fresh clone on every call rather than returning a cached one.
    '''
    global _numeric_collator
    _numeric_collator = _collator.clone()
    _numeric_collator.strength = _icu.UCOL_SECONDARY
    _numeric_collator.numeric = True
    return _numeric_collator
|
||||
|
||||
def numeric_sort_key(obj):
    '''
    Uses natural sorting for numbers inside strings so something2 will sort
    before something10.

    A false obj yields the empty key ``_none2``.
    '''
    if not obj:
        return _none2
    try:
        try:
            return _numeric_collator.sort_key(obj)
        except AttributeError:
            # _numeric_collator has not been created yet
            return numeric_collator().sort_key(obj)
    except TypeError:
        # Retry once with NUL characters removed from the input
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        obj = obj.replace(nul, empty)
        return _numeric_collator.sort_key(obj)
|
||||
|
||||
def icu_change_case(upper, locale, obj):
    '''
    Convert obj to upper case (if ``upper`` is true) or lower case using the
    ICU module with the given locale.
    '''
    if upper:
        func = _icu.upper
    else:
        func = _icu.lower
    try:
        return func(locale, obj)
    except TypeError:
        # Retry once with NUL characters removed from the input
        nul, empty = (u'\0', u'') if isinstance(obj, unicode) else (b'\0', b'')
        return func(locale, obj.replace(nul, empty))
|
||||
|
||||
def py_find(pattern, source):
    '''
    Pure python substring search.

    Returns ``(position, length of pattern)`` for the first occurrence of
    pattern in source, or ``(-1, -1)`` if there is none.
    '''
    pos = source.find(pattern)
    return (pos, len(pattern)) if pos > -1 else (-1, -1)
|
||||
|
||||
def character_name(string):
    '''
    Return the unicode name of the character in string, or None.

    The ICU module is preferred. If using it raises AttributeError,
    unicodedata is used on the first character instead. TypeError, ValueError
    and KeyError result in None.
    '''
    try:
        try:
            return _icu.character_name(unicode(string)) or None
        except AttributeError:
            import unicodedata
            return unicodedata.name(unicode(string)[0], None)
    except (TypeError, ValueError, KeyError):
        return None
|
||||
|
||||
def character_name_from_code(code):
    '''
    Return the unicode name of the code point ``code`` or the empty string.

    The ICU module is preferred. If using it raises AttributeError,
    unicodedata is used instead. TypeError, ValueError and KeyError result in
    the empty string.
    '''
    try:
        try:
            name = _icu.character_name_from_code(code)
        except AttributeError:
            import unicodedata
            return unicodedata.name(py_safe_chr(code), '')
        return name or ''
    except (TypeError, ValueError, KeyError):
        return ''
|
||||
|
||||
# py_safe_chr(i) returns the character for code point i, using only python.
if sys.maxunicode < 0x10ffff:
    def py_safe_chr(i):
        # Narrow builds of python cannot represent code point > 0xffff as a
        # single character, so we need our own implementation of unichr
        # that returns them as a surrogate pair
        return (b"\U%s" % (hex(i)[2:].zfill(8))).decode('unicode-escape')
else:
    try:
        py_safe_chr = unichr
    except NameError:
        # No unichr builtin available, fall back to chr
        py_safe_chr = chr
|
||||
|
||||
def safe_chr(code):
    'Return the character for code, via the ICU module when possible, else py_safe_chr()'
    try:
        result = _icu.chr(code)
    except AttributeError:
        result = py_safe_chr(code)
    return result
|
||||
|
||||
def normalize(text, mode='NFC'):
    '''
    Return the unicode normalization form ``mode`` of text.

    Uses the ICU module, falling back to unicodedata.normalize() when an
    AttributeError or KeyError is raised (for example for a mode name that is
    not a key of ``_nmodes``).
    '''
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    try:
        return _icu.normalize(_nmodes[mode], unicode(text))
    except (AttributeError, KeyError):
        import unicodedata
        return unicodedata.normalize(mode, unicode(text))
|
||||
|
||||
def icu_find(collator, pattern, source):
    '''
    Return ``collator.find(pattern, source)``. If that raises TypeError, the
    call is retried with both arguments converted to unicode.
    '''
    try:
        result = collator.find(pattern, source)
    except TypeError:
        result = collator.find(unicode(pattern), unicode(source))
    return result
|
||||
|
||||
def icu_startswith(collator, a, b):
    '''
    Return ``collator.startswith(a, b)``. If that raises TypeError, the call
    is retried with both arguments converted to unicode.
    '''
    try:
        result = collator.startswith(a, b)
    except TypeError:
        result = collator.startswith(unicode(a), unicode(b))
    return result
|
||||
|
||||
def py_case_sensitive_sort_key(obj):
    'Pure python case sensitive sort key: obj itself, or the empty key for a false value'
    return obj if obj else _none
|
||||
|
||||
def icu_case_sensitive_sort_key(collator, obj):
    'Return the sort key of obj from collator, or the empty key for a false value'
    return collator.sort_key(obj) if obj else _none2
|
||||
|
||||
def icu_strcmp(collator, a, b):
    'Compare a and b with collator after lower casing both with lower()'
    lowered_a, lowered_b = lower(a), lower(b)
    return collator.strcmp(lowered_a, lowered_b)
|
||||
|
||||
def py_strcmp(a, b):
    'Pure python comparison of a and b after lower casing both, using cmp()'
    lowered_a, lowered_b = a.lower(), b.lower()
    return cmp(lowered_a, lowered_b)
|
||||
|
||||
def icu_case_sensitive_strcmp(collator, a, b):
    'Compare a and b with collator, without any case conversion'
    result = collator.strcmp(a, b)
    return result
|
||||
|
||||
def icu_capitalize(s):
    'Lower case s with lower(), then upper case its first character with upper()'
    s = lower(s)
    if not s:
        return s
    first = s[0]
    # Only the first occurrence, i.e. the leading character, is replaced
    return s.replace(first, upper(first), 1)
|
||||
|
||||
# Cache of contractions, keyed by collator object
_cmap = {}

def icu_contractions(collator):
    '''
    Return the contractions of collator, computing and caching them on first
    use.

    The result is a frozenset of the non-empty entries reported by
    ``collator.contractions()``, or an empty dict when nothing is reported.
    '''
    cached = _cmap.get(collator, None)
    if cached is not None:
        return cached
    reported = collator.contractions()
    if reported:
        cached = frozenset(filter(None, reported))
    else:
        cached = {}
    _cmap[collator] = cached
    return cached
|
||||
|
||||
def icu_collation_order(collator, a):
    '''
    Return ``collator.collation_order(a)``. If that raises TypeError, the
    call is retried with a converted to unicode.
    '''
    try:
        order = collator.collation_order(a)
    except TypeError:
        order = collator.collation_order(unicode(a))
    return order
|
||||
|
||||
# Import time initialisation: load the plugin and the default collator, then
# derive the flags and tables used by the functions in this module.
load_icu()
load_collator()
# True when the ICU backed implementations cannot be used
_icu_not_ok = _icu is None or _collator is None
icu_unicode_version = getattr(_icu, 'unicode_version', None)
# Maps normalization mode names to the plugin's UNORM_* constants (None when
# the plugin does not define the constant)
_nmodes = {m:getattr(_icu, 'UNORM_'+m, None) for m in ('NFC', 'NFD', 'NFKC', 'NFKD', 'NONE', 'DEFAULT', 'FCD')}

# Tell the plugin to use UTF-8 when python's own default/filesystem encoding
# is unset or ASCII. This is best effort: failures (including _icu being
# None) are deliberately ignored, but only ordinary exceptions are swallowed
# so that KeyboardInterrupt and SystemExit still propagate.
try:
    senc = sys.getdefaultencoding()
    if not senc or senc.lower() == 'ascii':
        _icu.set_default_encoding('utf-8')
    del senc
except Exception:
    pass

try:
    fenc = sys.getfilesystemencoding()
    if not fenc or fenc.lower() == 'ascii':
        _icu.set_filesystem_encoding('utf-8')
    del fenc
except Exception:
    pass
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
################# The string functions ########################################
|
||||
|
||||
# Each public name below is bound once, at import time, either to the pure
# python fallback (when ICU is not usable) or to the ICU backed
# implementation with the default collator/locale already bound.

sort_key = py_sort_key if _icu_not_ok else partial(icu_sort_key, _collator)

strcmp = py_strcmp if _icu_not_ok else partial(icu_strcmp, _collator)

case_sensitive_sort_key = py_case_sensitive_sort_key if _icu_not_ok else \
        partial(icu_case_sensitive_sort_key, _collator)

# icu_case_sensitive_strcmp() takes the collator as its first argument, so it
# has to be bound here like the other ICU functions. Without the binding, the
# two argument call that works with the cmp fallback raises a TypeError as
# soon as ICU is available.
case_sensitive_strcmp = cmp if _icu_not_ok else \
        partial(icu_case_sensitive_strcmp, _collator)

upper = (lambda s: s.upper()) if _icu_not_ok else \
    partial(icu_change_case, True, get_locale())

lower = (lambda s: s.lower()) if _icu_not_ok else \
    partial(icu_change_case, False, get_locale())

title_case = (lambda s: s.title()) if _icu_not_ok else \
    partial(_icu.title, get_locale())

capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
    (lambda s: icu_capitalize(s))

find = (py_find if _icu_not_ok else partial(icu_find, _collator))

contractions = ((lambda : {}) if _icu_not_ok else (partial(icu_contractions,
    _collator)))
|
||||
|
||||
def primary_strcmp(a, b):
    '''
    strcmp that ignores case and accents on letters.

    Without ICU, both strings are passed through ascii_text() and compared
    with py_strcmp(). Otherwise the primary collator is used, which is
    created on demand.
    '''
    if _icu_not_ok:
        from calibre.utils.filenames import ascii_text
        plain_a, plain_b = ascii_text(a), ascii_text(b)
        return py_strcmp(plain_a, plain_b)
    try:
        return _primary_collator.strcmp(a, b)
    except AttributeError:
        # _primary_collator has not been created yet
        return primary_collator().strcmp(a, b)
|
||||
|
||||
def primary_find(pat, src):
    '''
    find that ignores case and accents on letters.

    Without ICU, both strings are passed through ascii_text() and searched
    with py_find(). Otherwise primary_icu_find() does the work.
    '''
    if not _icu_not_ok:
        return primary_icu_find(pat, src)
    from calibre.utils.filenames import ascii_text
    return py_find(ascii_text(pat), ascii_text(src))
|
||||
|
||||
def primary_icu_find(pat, src):
    'Search for pat in src with icu_find() and the primary collator, creating the collator on demand'
    try:
        return icu_find(_primary_collator, pat, src)
    except AttributeError:
        # _primary_collator has not been created yet
        collator = primary_collator()
        return icu_find(collator, pat, src)
|
||||
|
||||
def primary_sort_key(val):
    '''
    A sort key that ignores case and diacritics.

    Without ICU this is the lower cased result of ascii_text(). Otherwise the
    key comes from the primary collator, which is created on demand.
    '''
    if _icu_not_ok:
        from calibre.utils.filenames import ascii_text
        plain = ascii_text(val)
        return plain.lower()
    try:
        return _primary_collator.sort_key(val)
    except AttributeError:
        # _primary_collator has not been created yet
        return primary_collator().sort_key(val)
|
||||
|
||||
def primary_startswith(a, b):
    '''
    Return True if a starts with b.

    Without ICU, both strings are passed through ascii_text() and lower cased
    before the comparison. Otherwise icu_startswith() is used with the
    primary collator, which is created on demand.
    '''
    if _icu_not_ok:
        from calibre.utils.filenames import ascii_text
        haystack = ascii_text(a).lower()
        prefix = ascii_text(b).lower()
        return haystack.startswith(prefix)
    try:
        return icu_startswith(_primary_collator, a, b)
    except AttributeError:
        # _primary_collator has not been created yet
        return icu_startswith(primary_collator(), a, b)
|
||||
|
||||
def collation_order(a):
    '''
    Return the collation order of a as a 2-tuple.

    Without ICU this is ``(ord(a[0]), 1)``, or ``(0, 0)`` for an empty a.
    Otherwise the result of icu_collation_order() with the sort collator is
    returned, the collator being created on demand.
    '''
    if _icu_not_ok:
        if a:
            return (ord(a[0]), 1)
        return (0, 0)
    try:
        return icu_collation_order(_sort_collator, a)
    except AttributeError:
        # _sort_collator has not been created yet
        return icu_collation_order(sort_collator(), a)
|
||||
|
||||
################################################################################
|
||||
|
||||
def test(): # {{{
|
||||
from calibre import prints
|
||||
# Data {{{
|
||||
german = '''
|
||||
Sonntag
|
||||
Montag
|
||||
Dienstag
|
||||
Januar
|
||||
Februar
|
||||
März
|
||||
Fuße
|
||||
Fluße
|
||||
Flusse
|
||||
flusse
|
||||
fluße
|
||||
flüße
|
||||
flüsse
|
||||
'''
|
||||
german_good = '''
|
||||
Dienstag
|
||||
Februar
|
||||
flusse
|
||||
Flusse
|
||||
fluße
|
||||
Fluße
|
||||
flüsse
|
||||
flüße
|
||||
Fuße
|
||||
Januar
|
||||
März
|
||||
Montag
|
||||
Sonntag'''
|
||||
french = '''
|
||||
dimanche
|
||||
lundi
|
||||
mardi
|
||||
janvier
|
||||
février
|
||||
mars
|
||||
déjà
|
||||
Meme
|
||||
deja
|
||||
même
|
||||
dejà
|
||||
bpef
|
||||
bœg
|
||||
Boef
|
||||
Mémé
|
||||
bœf
|
||||
boef
|
||||
bnef
|
||||
pêche
|
||||
pèché
|
||||
pêché
|
||||
pêche
|
||||
pêché'''
|
||||
french_good = '''
|
||||
bnef
|
||||
boef
|
||||
Boef
|
||||
bœf
|
||||
bœg
|
||||
bpef
|
||||
deja
|
||||
dejà
|
||||
déjà
|
||||
dimanche
|
||||
février
|
||||
janvier
|
||||
lundi
|
||||
mardi
|
||||
mars
|
||||
Meme
|
||||
Mémé
|
||||
même
|
||||
pèché
|
||||
pêche
|
||||
pêche
|
||||
pêché
|
||||
pêché'''
|
||||
# }}}
|
||||
|
||||
def create(l):
|
||||
l = l.decode('utf-8').splitlines()
|
||||
return [x.strip() for x in l if x.strip()]
|
||||
|
||||
def test_strcmp(entries):
|
||||
for x in entries:
|
||||
for y in entries:
|
||||
if strcmp(x, y) != cmp(sort_key(x), sort_key(y)):
|
||||
print 'strcmp failed for %r, %r'%(x, y)
|
||||
|
||||
german = create(german)
|
||||
c = _icu.Collator('de')
|
||||
c.numeric = True
|
||||
gs = list(sorted(german, key=c.sort_key))
|
||||
if gs != create(german_good):
|
||||
print 'German sorting failed'
|
||||
return
|
||||
print
|
||||
french = create(french)
|
||||
c = _icu.Collator('fr')
|
||||
c.numeric = True
|
||||
fs = list(sorted(french, key=c.sort_key))
|
||||
if fs != create(french_good):
|
||||
print 'French sorting failed (note that French fails with icu < 4.6)'
|
||||
return
|
||||
test_strcmp(german + french)
|
||||
|
||||
print '\nTesting case transforms in current locale'
|
||||
from calibre.utils.titlecase import titlecase
|
||||
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
|
||||
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
|
||||
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
|
||||
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
|
||||
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
|
||||
print
|
||||
|
||||
print '\nTesting primary collation'
|
||||
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse',
|
||||
u'Štepánek':u'ŠtepaneK'}.iteritems():
|
||||
if primary_strcmp(k, v) != 0:
|
||||
prints('primary_strcmp() failed with %s != %s'%(k, v))
|
||||
return
|
||||
if primary_find(v, u' '+k)[0] != 1:
|
||||
prints('primary_find() failed with %s not in %s'%(v, k))
|
||||
return
|
||||
|
||||
n = character_name(safe_chr(0x1f431))
|
||||
if n != u'CAT FACE':
|
||||
raise ValueError('Failed to get correct character name for 0x1f431: %r != %r' % n, u'CAT FACE')
|
||||
|
||||
global _primary_collator
|
||||
orig = _primary_collator
|
||||
_primary_collator = _icu.Collator('es')
|
||||
if primary_strcmp(u'peña', u'pena') == 0:
|
||||
print 'Primary collation in Spanish locale failed'
|
||||
return
|
||||
_primary_collator = orig
|
||||
|
||||
print '\nTesting contractions'
|
||||
c = _icu.Collator('cs')
|
||||
if icu_contractions(c) != frozenset([u'Z\u030c', u'z\u030c', u'Ch',
|
||||
u'C\u030c', u'ch', u'cH', u'c\u030c', u's\u030c', u'r\u030c', u'CH',
|
||||
u'S\u030c', u'R\u030c']):
|
||||
print 'Contractions for the Czech language failed'
|
||||
return
|
||||
|
||||
print '\nTesting startswith'
|
||||
p = primary_startswith
|
||||
if (not p('asd', 'asd') or not p('asd', 'A') or
|
||||
not p('x', '')):
|
||||
print 'startswith() failed'
|
||||
return
|
||||
|
||||
print '\nTesting collation_order()'
|
||||
for group in [
|
||||
('Šaa', 'Smith', 'Solženicyn', 'Štepánek'),
|
||||
('calibre', 'Charon', 'Collins'),
|
||||
('01', '1'),
|
||||
('1', '11', '13'),
|
||||
]:
|
||||
last = None
|
||||
for x in group:
|
||||
val = icu_collation_order(sort_collator(), x)
|
||||
if val[1] != 1:
|
||||
prints('collation_order() returned incorrect length for', x)
|
||||
if last is None:
|
||||
last = val
|
||||
else:
|
||||
if val != last:
|
||||
prints('collation_order() returned incorrect value for', x)
|
||||
last = val
|
||||
|
||||
# }}}
|
||||
|
||||
def test_roundtrip():
    '''
    Check that ``_icu.roundtrip()`` returns each sample string unchanged.

    Raises ValueError for the first sample that does not survive.
    '''
    samples = (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple')
    for r in samples:
        rp = _icu.roundtrip(r)
        if rp == r:
            continue
        raise ValueError(u'Roundtripping failed: %r != %r' % (r, rp))
|
||||
|
||||
def test_normalize_performance():
    '''
    Compare the speed of normalize() with unicodedata.normalize().

    The text is read from the UTF-8 encoded file ``t.txt`` in the current
    directory; if that file does not exist nothing is done. Each
    implementation normalizes the text 100 times and the elapsed times are
    printed.
    '''
    import os
    if not os.path.exists('t.txt'):
        return
    # Use a context manager so the file is closed as soon as it has been read
    with open('t.txt', 'rb') as f:
        raw = f.read().decode('utf-8')
    print (len(raw))
    import time
    import unicodedata
    count = 100
    st = time.time()
    for i in xrange(count):
        normalize(raw)
    print ('ICU time:', time.time() - st)
    st = time.time()
    for i in xrange(count):
        unicodedata.normalize('NFC', unicode(raw))
    print ('py time:', time.time() - st)
|
||||
|
||||
if __name__ == '__main__':
    # Run the self tests in order: roundtrip, normalize performance, full test
    for _func in (test_roundtrip, test_normalize_performance, test):
        _func()
|
||||
|
Loading…
x
Reference in New Issue
Block a user