From b55dd98bce1cefab0b0191e29f80de1a7ed3ba1c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 21 Mar 2023 15:46:27 +0530
Subject: [PATCH] Tag mapper: Ensure tag rules are unicode normalized before matching

---
 src/calibre/ebooks/metadata/tag_mapper.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/metadata/tag_mapper.py b/src/calibre/ebooks/metadata/tag_mapper.py
index 4d41f1e616..5143c7fbcd 100644
--- a/src/calibre/ebooks/metadata/tag_mapper.py
+++ b/src/calibre/ebooks/metadata/tag_mapper.py
@@ -5,6 +5,7 @@
 from collections import deque
 
 from calibre.utils.icu import lower as icu_lower, upper as icu_upper
+from polyglot.builtins import as_unicode
 
 
 def compile_pat(pat):
@@ -14,25 +15,29 @@ def compile_pat(pat):
 
 
 def matcher(rule):
+    import unicodedata
+    def n(x):
+        return unicodedata.normalize('NFC', as_unicode(x or '', errors='replace'))
+
     mt = rule['match_type']
     if mt == 'one_of':
-        tags = {icu_lower(x.strip()) for x in rule['query'].split(',')}
+        tags = {icu_lower(n(x.strip())) for x in rule['query'].split(',')}
         return lambda x: x in tags
 
     if mt == 'not_one_of':
-        tags = {icu_lower(x.strip()) for x in rule['query'].split(',')}
+        tags = {icu_lower(n(x.strip())) for x in rule['query'].split(',')}
         return lambda x: x not in tags
 
     if mt == 'matches':
-        pat = compile_pat(rule['query'])
+        pat = compile_pat(n(rule['query']))
         return lambda x: pat.match(x) is not None
 
     if mt == 'not_matches':
-        pat = compile_pat(rule['query'])
+        pat = compile_pat(n(rule['query']))
         return lambda x: pat.match(x) is None
 
     if mt == 'has':
-        s = icu_lower(rule['query'])
+        s = icu_lower(n(rule['query']))
         return lambda x: s in x
 
     return lambda x: False