mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a regex-based accent removal function
This commit is contained in:
parent
0683be2a2c
commit
b3d266d737
@ -294,18 +294,29 @@ string_length = len
|
||||
utf16_length = _icu.utf16_length
|
||||
|
||||
|
||||
def remove_accents(txt: str) -> str:
|
||||
t = getattr(remove_accents, 'transliterator', None)
|
||||
def remove_accents_icu(txt: str) -> str:
    """Return *txt* with accents removed by an ICU transliterator.

    The transliterator is built from three rules: decompose to NFD, remove
    every ``[:Nonspacing Mark:]`` character, then recompose to NFC. It is
    created on the first call and cached as the ``transliterator`` attribute
    of this function so later calls reuse it.

    :param txt: the text to process
    :return: the result of ``transliterate(txt)``
    """
    t = getattr(remove_accents_icu, 'transliterator', None)
    if t is None:
        t = _icu.Transliterator('remove_accents', '''\
:: NFD (NFC);
:: [:Nonspacing Mark:] Remove;
:: NFC (NFD);
''')
        # Cache on this function -- the same object the getattr() above
        # reads -- so the transliterator is only constructed once.
        setattr(remove_accents_icu, 'transliterator', t)
    return t.transliterate(txt)
|
||||
|
||||
|
||||
def remove_accents_regex(txt: str) -> str:
    """Return *txt* with accents removed using a regular expression.

    The text is normalized to NFKD, every character matching ``\\p{Mn}``
    (Unicode category "Mark, nonspacing") is deleted, and the result is
    normalized to NFKC. Because the K (compatibility) forms are used,
    compatibility characters are folded to their equivalents as well.

    The compiled pattern and the ``unicodedata.normalize`` function are
    cached as attributes of this function on first use, which also keeps
    the ``regex`` import lazy.

    :param txt: the text to process
    :return: *txt* without nonspacing marks, in NFKC form
    """
    pat = getattr(remove_accents_regex, 'pat', None)
    if pat is None:
        import unicodedata

        import regex
        pat = regex.compile(r'\p{Mn}', flags=regex.UNICODE)
        # Publish ``normalize`` before ``pat``: ``pat`` is the attribute the
        # check above looks at, so once another thread can see it, every
        # attribute read below must already exist.
        remove_accents_regex.normalize = unicodedata.normalize
        remove_accents_regex.pat = pat
    normalize = remove_accents_regex.normalize
    return normalize('NFKC', pat.sub('', normalize('NFKD', txt)))
|
||||
|
||||
|
||||
################################################################################
|
||||
if __name__ == '__main__':
|
||||
from calibre.utils.icu_test import run
|
||||
|
@ -229,10 +229,13 @@ class TestICU(unittest.TestCase):
|
||||
self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))
|
||||
|
||||
def test_remove_accents(self):
    """Check that each accent-removal implementation strips diacritics.

    Both ``icu.remove_accents_icu`` and ``icu.remove_accents_regex`` are run
    against the same table of inputs and must produce the same expected
    output; text without accents (the CJK sample) must come back unchanged.
    """
    # Built once and shared by both implementations.
    cases = {
        'MännÄr': 'MannAr', 'Peña': 'Pena', 'Kátia': 'Katia',
        'Málaga': 'Malaga', 'François': 'Francois', 'Phút Hơn': 'Phut Hon',
        '中文': '中文',
    }
    for func in (icu.remove_accents_icu, icu.remove_accents_regex):
        for q, expected in cases.items():
            # The message identifies which implementation and input failed.
            self.ae(expected, func(q), '%s(%r)' % (func.__name__, q))
|
||||
|
||||
|
||||
def find_tests():
|
||||
|
Loading…
x
Reference in New Issue
Block a user