Add a regex based accent removal function

This commit is contained in:
Kovid Goyal 2022-04-23 10:45:39 +05:30
parent 0683be2a2c
commit b3d266d737
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 21 additions and 7 deletions

View File

@ -294,18 +294,29 @@ string_length = len
# Module-level alias so callers can use `utf16_length` without reaching into `_icu`.
utf16_length = _icu.utf16_length
# NOTE(review): this span appears to be a flattened diff hunk -- a superseded
# line is immediately followed by its replacement, and leading indentation has
# been lost. Read the `remove_accents` lines as the old spelling and the
# `remove_accents_icu` lines as the new one.
# Superseded definition line and cache lookup (function named remove_accents):
def remove_accents(txt: str) -> str:
t = getattr(remove_accents, 'transliterator', None)
# Replacement lines: same logic, with the function renamed to remove_accents_icu.
def remove_accents_icu(txt: str) -> str:
t = getattr(remove_accents_icu, 'transliterator', None)
# No cached transliterator on the function object yet, so build one. Its rules
# decompose to NFD, remove every nonspacing mark, then recompose to NFC.
if t is None:
t = _icu.Transliterator('remove_accents', '''\
:: NFD (NFC);
:: [:Nonspacing Mark:] Remove;
:: NFC (NFD);
''')
# Store the transliterator as a function attribute so later calls reuse it
# (old-name line first, then its replacement).
setattr(remove_accents, 'transliterator', t)
setattr(remove_accents_icu, 'transliterator', t)
# Return the transliterated form of the input text.
return t.transliterate(txt)
def remove_accents_regex(txt: str) -> str:
    """Return *txt* with accent marks stripped, using a regular expression.

    The text is decomposed with NFKD, every character in the Unicode
    general category ``Mn`` (nonspacing mark) is deleted, and the result is
    recomposed with NFKC. Note that these are the *compatibility*
    normalization forms.

    The compiled pattern and the ``unicodedata.normalize`` callable are
    stored as attributes (``pat`` and ``normalize``) on this function, so
    the imports and the pattern compilation happen only when no pattern has
    been cached yet.
    """
    this = remove_accents_regex
    if getattr(this, 'pat', None) is None:
        # Deferred imports: only paid for when the cache is empty.
        import regex
        import unicodedata
        this.pat = regex.compile(r'\p{Mn}', flags=regex.UNICODE)
        this.normalize = unicodedata.normalize
    decomposed = this.normalize('NFKD', txt)
    without_marks = this.pat.sub('', decomposed)
    return this.normalize('NFKC', without_marks)
################################################################################
if __name__ == '__main__':
from calibre.utils.icu_test import run

View File

@ -229,10 +229,13 @@ class TestICU(unittest.TestCase):
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))
# NOTE(review): this span appears to be a flattened diff hunk -- the superseded
# loop comes first, followed by its replacement; indentation is not preserved.
def test_remove_accents(self):
# Superseded version: checked a three-entry table against icu.remove_accents only.
for q, expected in {
'MännÄr': 'MannAr', 'Peña': 'Pena', 'Kátia': 'Katia',
}.items():
self.ae(expected, icu.remove_accents(q))
# Replacement version: the same (extended) table is run through both
# implementations, so each must produce the identical expected output.
for func in (icu.remove_accents_icu, icu.remove_accents_regex):
for q, expected in {
'MännÄr': 'MannAr', 'Peña': 'Pena', 'Kátia': 'Katia',
'Málaga': 'Malaga', 'François': 'Francois', 'Phút Hơn': 'Phut Hon',
# Input equals expected output here: this text must come back unchanged.
'中文':'中文'
}.items():
self.ae(expected, func(q))
def find_tests():