From 303006cb11811861665f9b507b6910aeefd5f3d2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 23 Apr 2022 12:14:04 +0530
Subject: [PATCH] ICU collator for searching ignoring punctuation

---
 src/calibre/utils/icu.py      | 17 ++++++++++++++++-
 src/calibre/utils/icu_test.py |  2 ++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py
index 9b859880a0..d39099c694 100644
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -12,7 +12,8 @@ from calibre.utils.config_base import tweaks, prefs
 from calibre_extensions import icu as _icu
 from polyglot.builtins import cmp
 
-_locale = _collator = _primary_collator = _sort_collator = _non_numeric_sort_collator = _numeric_collator = _case_sensitive_collator = None
+_locale = _collator = _primary_collator = _sort_collator = _non_numeric_sort_collator = _numeric_collator = None
+_case_sensitive_collator = _primary_no_punc_collator = None
 cmp
 
 _none = ''
@@ -66,7 +67,9 @@ def collator():
 
 def change_locale(locale=None):
     global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator, _non_numeric_sort_collator
+    global _primary_no_punc_collator
     _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = _non_numeric_sort_collator = None
+    _primary_no_punc_collator = None
     _locale = locale
 
 
@@ -79,6 +82,16 @@ def primary_collator():
     return _primary_collator
 
 
+def primary_collator_without_punctuation():
+    'Ignores case differences, accented characters and punctuation'
+    global _primary_no_punc_collator
+    if _primary_no_punc_collator is None:
+        _primary_no_punc_collator = collator().clone()
+        _primary_no_punc_collator.strength = _icu.UCOL_PRIMARY
+        _primary_no_punc_collator.set_attribute(_icu.UCOL_ALTERNATE_HANDLING, _icu.UCOL_SHIFTED)
+    return _primary_no_punc_collator
+
+
 def sort_collator():
     'Ignores case differences and recognizes numbers in strings (if the tweak is set)'
     global _sort_collator
@@ -223,8 +236,10 @@ except AttributeError: # For people running from source
 
 find = make_two_arg_func(collator, 'find')
 primary_find = make_two_arg_func(primary_collator, 'find')
+primary_no_punc_find = make_two_arg_func(primary_collator_without_punctuation, 'find')
 contains = make_two_arg_func(collator, 'contains')
 primary_contains = make_two_arg_func(primary_collator, 'contains')
+primary_no_punc_contains = make_two_arg_func(primary_collator_without_punctuation, 'contains')
 startswith = make_two_arg_func(collator, 'startswith')
 primary_startswith = make_two_arg_func(primary_collator, 'startswith')
 safe_chr = _icu.chr
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 35830fd8ff..548a10ed30 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -114,6 +114,8 @@ class TestICU(unittest.TestCase):
         self.assertTrue(icu.primary_contains('pena', 'peña'))
         x = icu.primary_collator()
         self.ae(x.get_attribute(icu._icu.UCOL_STRENGTH), icu._icu.UCOL_PRIMARY),
+        self.ae((0, 4), icu.primary_no_punc_find('pena"', 'peña'))
+        self.ae((0, 13), icu.primary_no_punc_find("typographers", 'typographer’s'))
 
     def test_collation_order(self):
         'Testing collation ordering'