diff --git a/scanner/providers/implementations/thexem.py b/scanner/providers/implementations/thexem.py index b0a971de..2d65a75c 100644 --- a/scanner/providers/implementations/thexem.py +++ b/scanner/providers/implementations/thexem.py @@ -1,3 +1,4 @@ +import re import logging from typing import Dict, List, Literal from aiohttp import ClientSession @@ -133,7 +134,9 @@ class TheXem: titles = [] def clean(s: str): - return s.lower().replace(" ", "") + s = s.lower() + s = re.sub(r"\([^)]*\)", "", s) # drop parenthesized segments, parentheses included (guessit does not allow them as part of a name) + return re.sub(r"[\W_]+", "", s) # strip runs of non-word characters and underscores (\w is Unicode-aware, so non-ASCII letters such as kanji or accented characters are kept) for x in map.values(): # Only the first element is a string (the show name) so we need to ignore the type hint diff --git a/scanner/scanner/parser/rules.py b/scanner/scanner/parser/rules.py index 32ed4243..d3c9456c 100644 --- a/scanner/scanner/parser/rules.py +++ b/scanner/scanner/parser/rules.py @@ -3,6 +3,7 @@ from typing import Any, List, Optional, cast from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS from rebulk.match import Matches, Match +import re from copy import copy @@ -249,5 +250,8 @@ class XemFixup(Rule): new_title.end = nmatch[0].end new_title.value = f"{title.value}{hole}{nmatch[0].value}" - if new_title.value.lower().replace(" ", "") in context["xem_titles"]: + def clean(s: str): + return re.sub(r"[\W_]+", "", s.lower()) # lower-case, then strip runs of non-word characters and underscores; NOTE(review): unlike the clean() added in thexem.py, "(...)" segments are not removed first - confirm that is intended + + if clean(new_title.value) in context["xem_titles"]: return [[title, nmatch[0]], [new_title]]