From 8b2c0f732f386147c3b92a83dd518e5ed7bbd611 Mon Sep 17 00:00:00 2001 From: Zoe Roux Date: Mon, 5 Feb 2024 23:20:28 +0100 Subject: [PATCH] Improve xem titles sanitizing --- .../implementations/themoviedatabase.py | 4 +++- scanner/providers/implementations/thexem.py | 22 +++++++++++++------ scanner/scanner/parser/guess.py | 11 ++++++---- scanner/scanner/parser/rules.py | 6 ++--- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/scanner/providers/implementations/themoviedatabase.py b/scanner/providers/implementations/themoviedatabase.py index 734b7a49..c6a8a4f0 100644 --- a/scanner/providers/implementations/themoviedatabase.py +++ b/scanner/providers/implementations/themoviedatabase.py @@ -463,7 +463,9 @@ class TheMovieDatabase(Provider): else None ) if tvdb_id is None: - logging.info("Tvdb could not be found, trying xem name lookup for %s", name) + logging.info( + "Tvdb could not be found, trying xem name lookup for %s", name + ) _, tvdb_id = await self._xem.get_show_override("tvdb", old_name) if tvdb_id is not None: ( diff --git a/scanner/providers/implementations/thexem.py b/scanner/providers/implementations/thexem.py index 2d65a75c..af0945a5 100644 --- a/scanner/providers/implementations/thexem.py +++ b/scanner/providers/implementations/thexem.py @@ -8,6 +8,17 @@ from providers.utils import ProviderError from scanner.cache import cache +def clean(s: str): + s = s.lower() + # remove content of () (guessit does not allow them as part of a name) + s = re.sub(r"\([^)]*\)", "", s) + # remove separators + s = re.sub(r"[:\-_/\\&|,;.=\"'+~~@`ー]+", " ", s) + # remove subsequent spaces (that may be introduced above) + s = re.sub(r" +", " ", s) + return s + + class TheXem: def __init__(self, client: ClientSession) -> None: self._client = client @@ -61,12 +72,13 @@ class TheXem: self, provider: Literal["tvdb"] | Literal["anidb"], show_name: str ): map = await self.get_map(provider) + show_name = clean(show_name) for [id, v] in map.items(): # Only the first element is a string (the show name) so we need to ignore the type hint master_show_name: str = v[0] # type: ignore for x in v[1:]: [(name, season)] = x.items() - if show_name.lower() == name.lower(): + if show_name == clean(name): return master_show_name, id return None, None @@ -76,11 +88,12 @@ class TheXem: map = await self.get_map(provider) if id not in map: return None + show_name = clean(show_name) # Ignore the first element, this is the show name has a string for x in map[id][1:]: [(name, season)] = x.items() # TODO: replace .lower() with something a bit smarter - if show_name.lower() == name.lower(): + if show_name == clean(name): return season return None @@ -133,11 +146,6 @@ class TheXem: map = await self.get_map(provider) titles = [] - def clean(s: str): - s = s.lower() - s = re.sub(r"\([^)]*\)", "", s) # remove content of () (guessit does not allow them as part of a name) - return re.sub(r"[\W_]+", "", s) # remove non alphanum content (it does keep non us chars like kanjis or accents) - for x in map.values(): # Only the first element is a string (the show name) so we need to ignore the type hint master_show_name: str = x[0] # type: ignore diff --git a/scanner/scanner/parser/guess.py b/scanner/scanner/parser/guess.py index 7f36cf4e..f37f52b7 100644 --- a/scanner/scanner/parser/guess.py +++ b/scanner/scanner/parser/guess.py @@ -1,5 +1,11 @@ #!/usr/bin/env python3 +if __name__ == "__main__": + import sys + from pathlib import Path + + sys.path.append(str(Path(f"{__file__}/../../..").resolve())) + from guessit.api import default_api from typing import cast, List from rebulk import Rebulk @@ -29,14 +35,11 @@ def guessit(name: str, *, xem_titles: List[str] = []): if __name__ == "__main__": import sys import json - from pathlib import Path + from providers.implementations.thexem import TheXem from guessit.jsonutils import GuessitEncoder from aiohttp import ClientSession import asyncio - sys.path.append(str(Path(f"{__file__}/../../..").resolve())) - from providers.implementations.thexem import TheXem - async def main(): async with ClientSession() as client: xem = TheXem(client) diff --git a/scanner/scanner/parser/rules.py b/scanner/scanner/parser/rules.py index d3c9456c..6faeb5c5 100644 --- a/scanner/scanner/parser/rules.py +++ b/scanner/scanner/parser/rules.py @@ -3,9 +3,10 @@ from typing import Any, List, Optional, cast from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS from rebulk.match import Matches, Match -import re from copy import copy +from providers.implementations.thexem import clean + class EpisodeTitlePromotion(Rule): """Promote "episode_title" to "episode" when the title is in fact the episode number @@ -250,8 +251,5 @@ class XemFixup(Rule): new_title.end = nmatch[0].end new_title.value = f"{title.value}{hole}{nmatch[0].value}" - def clean(s: str): - return re.sub(r"[\W_]+", "", s.lower()) - if clean(new_title.value) in context["xem_titles"]: return [[title, nmatch[0]], [new_title]]