Remove non-alphanumeric characters from titles for xem lookup

2025-07-08 18:54:22 -04:00 · 2024-02-05 22:36:57 +01:00 · 2024-02-05 22:36:57 +01:00 · a055dfac5b
commit a055dfac5b
parent e772a798f7
2 changed files with 9 additions and 2 deletions
--- a/scanner/providers/implementations/thexem.py
+++ b/scanner/providers/implementations/thexem.py
@ -1,3 +1,4 @@
+import re
 import logging
 from typing import Dict, List, Literal
 from aiohttp import ClientSession
@ -133,7 +134,9 @@ class TheXem:
 		titles = []

 		def clean(s: str):
-			return s.lower().replace(" ", "")
+			s = s.lower()
+			s = re.sub(r"\([^)]*\)", "", s) # remove content of () (guessit does not allow them as part of a name)
+			return re.sub(r"[\W_]+", "", s) # remove non alphanum content (it does keep non us chars like kanjis or accents)

 		for x in map.values():
 			# Only the first element is a string (the show name) so we need to ignore the type hint
--- a/scanner/scanner/parser/rules.py
+++ b/scanner/scanner/parser/rules.py
@ -3,6 +3,7 @@
 from typing import Any, List, Optional, cast
 from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS
 from rebulk.match import Matches, Match
+import re
 from copy import copy


@ -249,5 +250,8 @@ class XemFixup(Rule):
 		new_title.end = nmatch[0].end
 		new_title.value = f"{title.value}{hole}{nmatch[0].value}"

-		if new_title.value.lower().replace(" ", "") in context["xem_titles"]:
+		def clean(s: str):
+			return re.sub(r"[\W_]+", "", s.lower())
+
+		if clean(new_title.value) in context["xem_titles"]:
 			return [[title, nmatch[0]], [new_title]]