From b3d266d737574ef8bb839ae953424ea383498f12 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Apr 2022 10:45:39 +0530 Subject: [PATCH] Add a regex based accent removal function --- src/calibre/utils/icu.py | 17 ++++++++++++++--- src/calibre/utils/icu_test.py | 11 +++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 1c37540995..db31a729a5 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -294,18 +294,29 @@ string_length = len utf16_length = _icu.utf16_length -def remove_accents(txt: str) -> str: - t = getattr(remove_accents, 'transliterator', None) +def remove_accents_icu(txt: str) -> str: + t = getattr(remove_accents_icu, 'transliterator', None) if t is None: t = _icu.Transliterator('remove_accents', '''\ :: NFD (NFC); :: [:Nonspacing Mark:] Remove; :: NFC (NFD); ''') - setattr(remove_accents, 'transliterator', t) + setattr(remove_accents_icu, 'transliterator', t) return t.transliterate(txt) +def remove_accents_regex(txt: str) -> str: + pat = getattr(remove_accents_regex, 'pat', None) + if pat is None: + import regex, unicodedata + pat = regex.compile(r'\p{Mn}', flags=regex.UNICODE) + setattr(remove_accents_regex, 'pat', pat) + setattr(remove_accents_regex, 'normalize', unicodedata.normalize) + normalize = remove_accents_regex.normalize + return normalize('NFKC', pat.sub('', normalize('NFKD', txt))) + + ################################################################################ if __name__ == '__main__': from calibre.utils.icu_test import run diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index f1a45a4c0d..2e13a45b4f 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -229,10 +229,13 @@ class TestICU(unittest.TestCase): self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos)) def test_remove_accents(self): - for q, expected in { - 'MännÄr': 'MannAr', 'Peña': 'Pena', 'Kátia': 'Katia', - }.items(): - self.ae(expected, icu.remove_accents(q)) + for func in (icu.remove_accents_icu, icu.remove_accents_regex): + for q, expected in { + 'MännÄr': 'MannAr', 'Peña': 'Pena', 'Kátia': 'Katia', + 'Málaga': 'Malaga', 'François': 'Francois', 'Phút Hơn': 'Phut Hon', + '中文':'中文' + }.items(): + self.ae(expected, func(q)) def find_tests():