Clean up collator management

This commit is contained in:
Kovid Goyal 2022-04-24 12:44:50 +05:30
parent 6e7fd724fd
commit 9814b8060e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,13 +7,13 @@ __docformat__ = 'restructuredtext en'
# Setup code {{{ # Setup code {{{
import codecs import codecs
import sys import sys
from functools import lru_cache
from calibre.utils.config_base import tweaks, prefs from calibre.utils.config_base import prefs, tweaks
from calibre_extensions import icu as _icu from calibre_extensions import icu as _icu
from polyglot.builtins import cmp from polyglot.builtins import cmp
_locale = _collator = _primary_collator = _sort_collator = _non_numeric_sort_collator = _numeric_collator = None _locale = None
_case_sensitive_collator = _primary_no_punc_collator = None
cmp cmp
_none = '' _none = ''
@ -48,91 +48,72 @@ except:
del is_ascii del is_ascii
@lru_cache(maxsize=32)
def collator(strength=None, numeric=None, ignore_alternate_chars=None, upper_first=None):
    '''
    Return an ICU collator for the sorting locale, memoized per combination
    of arguments.

    When every argument is None the base collator for the locale is created.
    Otherwise the (cached) base collator is cloned and only the attributes
    that were explicitly passed are set on the clone. Because of the
    lru_cache the same object is handed back for repeated calls with the same
    arguments, it is not copied per call.

    :param strength: if not None, assigned to the ``strength`` attribute
    :param numeric: if not None, assigned to the ``numeric`` attribute
    :param ignore_alternate_chars: if not None, selects UCOL_SHIFTED (truthy)
        or UCOL_NON_IGNORABLE (falsy) for UCOL_ALTERNATE_HANDLING
    :param upper_first: if not None, assigned to the ``upper_first`` attribute
    '''
    global _locale
    if _locale is None:
        # Resolve the locale lazily: the tweak wins over the interface language
        if tweaks['locale_for_sorting']:
            _locale = tweaks['locale_for_sorting']
        else:
            from calibre.utils.localization import get_lang
            _locale = get_lang()
    if strength is None and numeric is None and ignore_alternate_chars is None and upper_first is None:
        # Base collator: fall back to English if the locale cannot be loaded
        try:
            ans = _icu.Collator(_locale)
        except Exception as e:
            print(f'Failed to load collator for locale: {_locale!r} with error {e!r}, using English', file=sys.stderr)
            ans = _icu.Collator('en')
    else:
        # Customised collator: start from a clone so the base one is untouched
        ans = collator().clone()
        if strength is not None:
            ans.strength = strength
        if numeric is not None:
            ans.numeric = numeric
        if upper_first is not None:
            ans.upper_first = upper_first
        if ignore_alternate_chars is not None:
            try:
                ans.set_attribute(_icu.UCOL_ALTERNATE_HANDLING, _icu.UCOL_SHIFTED if ignore_alternate_chars else _icu.UCOL_NON_IGNORABLE)
            except AttributeError:
                pass  # people running from source without latest binary
    return ans
def change_locale(locale=None):
    '''
    Set the locale used for collation and discard all cached collators.

    :param locale: stored in the module level ``_locale``; when it is None,
        collator() works out the locale again the next time it is called
    '''
    global _locale
    _locale = locale
    # Cached collators were built for the previous locale, drop them
    collator.cache_clear()
def primary_collator():
    '''
    Ignores case differences and accented chars.

    Returns the collator produced by collator() with its strength set to
    UCOL_PRIMARY.
    '''
    return collator(strength=_icu.UCOL_PRIMARY)
def primary_collator_without_punctuation():
    '''
    Ignores space and punctuation and case differences and accented chars.

    Returns the collator produced by collator() with its strength set to
    UCOL_PRIMARY and ignore_alternate_chars turned on.
    '''
    return collator(strength=_icu.UCOL_PRIMARY, ignore_alternate_chars=True)
def sort_collator():
    '''
    Ignores case differences and recognizes numbers in strings (if the tweak is set).

    Returns the collator produced by collator() with strength UCOL_SECONDARY
    and ``numeric`` taken from prefs['numeric_collation']. The preference is
    read on every call and passed as an argument to collator().
    '''
    return collator(strength=_icu.UCOL_SECONDARY, numeric=prefs['numeric_collation'])
def non_numeric_sort_collator():
    '''
    Ignores case differences only.

    Returns the collator produced by collator() with strength UCOL_SECONDARY
    and ``numeric`` forced to False, regardless of the numeric_collation pref.
    '''
    return collator(strength=_icu.UCOL_SECONDARY, numeric=False)
def numeric_collator():
    '''
    Uses natural sorting for numbers inside strings so something2 will sort before something10.

    Returns the collator produced by collator() with strength UCOL_SECONDARY
    and ``numeric`` forced to True.
    '''
    return collator(strength=_icu.UCOL_SECONDARY, numeric=True)
def case_sensitive_collator():
    '''
    Always sorts upper case letter before lower case.

    Returns the collator produced by collator() with ``upper_first`` set to
    True and ``numeric`` taken from prefs['numeric_collation']; the strength
    is left at whatever the base collator uses.
    '''
    return collator(numeric=prefs['numeric_collation'], upper_first=True)
def make_sort_key_func(collator_function, func_name='sort_key'): def make_sort_key_func(collator_function, func_name='sort_key'):
@ -273,10 +254,8 @@ def normalize(text, mode='NFC'):
def contractions(col=None): def contractions(col=None):
global _cmap global _cmap
col = col or _collator col = col or collator()
if col is None: ans = _cmap.get(col, None)
col = collator()
ans = _cmap.get(collator, None)
if ans is None: if ans is None:
ans = col.contractions() ans = col.contractions()
ans = frozenset(filter(None, ans)) ans = frozenset(filter(None, ans))
@ -326,7 +305,8 @@ def remove_accents_icu(txt: str) -> str:
def remove_accents_regex(txt: str) -> str: def remove_accents_regex(txt: str) -> str:
pat = getattr(remove_accents_regex, 'pat', None) pat = getattr(remove_accents_regex, 'pat', None)
if pat is None: if pat is None:
import regex, unicodedata import regex
import unicodedata
pat = regex.compile(r'\p{Mn}', flags=regex.UNICODE) pat = regex.compile(r'\p{Mn}', flags=regex.UNICODE)
setattr(remove_accents_regex, 'pat', pat) setattr(remove_accents_regex, 'pat', pat)
setattr(remove_accents_regex, 'normalize', unicodedata.normalize) setattr(remove_accents_regex, 'normalize', unicodedata.normalize)