Improve xem titles sanitizing

This commit is contained in:
Zoe Roux 2024-02-05 23:20:28 +01:00
parent fbd76594ea
commit 8b2c0f732f
4 changed files with 27 additions and 16 deletions

View File

@ -463,7 +463,9 @@ class TheMovieDatabase(Provider):
else None else None
) )
if tvdb_id is None: if tvdb_id is None:
logging.info("Tvdb could not be found, trying xem name lookup for %s", name) logging.info(
"Tvdb could not be found, trying xem name lookup for %s", name
)
_, tvdb_id = await self._xem.get_show_override("tvdb", old_name) _, tvdb_id = await self._xem.get_show_override("tvdb", old_name)
if tvdb_id is not None: if tvdb_id is not None:
( (

View File

@ -8,6 +8,17 @@ from providers.utils import ProviderError
from scanner.cache import cache from scanner.cache import cache
def clean(s: str) -> str:
    """Normalise a show title so different spellings of it compare equal.

    The title is lower-cased, every parenthesised part is dropped, each run
    of separator characters becomes a single space, repeated spaces are
    collapsed and surrounding whitespace is removed.

    Args:
        s: The raw title to normalise.

    Returns:
        The normalised title.
    """
    s = s.lower()
    # remove content of () (guessit does not allow them as part of a name)
    s = re.sub(r"\([^)]*\)", "", s)
    # remove separators
    s = re.sub(r"[:\-_/\\&|,;.=\"'+~@`ー]+", " ", s)
    # remove subsequent spaces (that may be introduced above)
    s = re.sub(r" +", " ", s)
    # Dropping a trailing "(...)" or an edge separator leaves a space behind;
    # without this strip "Show (2020)" and "Show" would not compare equal.
    return s.strip()
class TheXem: class TheXem:
def __init__(self, client: ClientSession) -> None: def __init__(self, client: ClientSession) -> None:
self._client = client self._client = client
@ -61,12 +72,13 @@ class TheXem:
self, provider: Literal["tvdb"] | Literal["anidb"], show_name: str self, provider: Literal["tvdb"] | Literal["anidb"], show_name: str
): ):
map = await self.get_map(provider) map = await self.get_map(provider)
show_name = clean(show_name)
for [id, v] in map.items(): for [id, v] in map.items():
# Only the first element is a string (the show name) so we need to ignore the type hint # Only the first element is a string (the show name) so we need to ignore the type hint
master_show_name: str = v[0] # type: ignore master_show_name: str = v[0] # type: ignore
for x in v[1:]: for x in v[1:]:
[(name, season)] = x.items() [(name, season)] = x.items()
if show_name.lower() == name.lower(): if show_name == clean(name):
return master_show_name, id return master_show_name, id
return None, None return None, None
@ -76,11 +88,12 @@ class TheXem:
map = await self.get_map(provider) map = await self.get_map(provider)
if id not in map: if id not in map:
return None return None
show_name = clean(show_name)
# Ignore the first element, this is the show name as a string # Ignore the first element, this is the show name as a string
for x in map[id][1:]: for x in map[id][1:]:
[(name, season)] = x.items() [(name, season)] = x.items()
# TODO: replace .lower() with something a bit smarter # TODO: replace .lower() with something a bit smarter
if show_name.lower() == name.lower(): if show_name == clean(name):
return season return season
return None return None
@ -133,11 +146,6 @@ class TheXem:
map = await self.get_map(provider) map = await self.get_map(provider)
titles = [] titles = []
def clean(s: str):
s = s.lower()
s = re.sub(r"\([^)]*\)", "", s) # remove content of () (guessit does not allow them as part of a name)
return re.sub(r"[\W_]+", "", s) # remove non alphanum content (it does keep non us chars like kanjis or accents)
for x in map.values(): for x in map.values():
# Only the first element is a string (the show name) so we need to ignore the type hint # Only the first element is a string (the show name) so we need to ignore the type hint
master_show_name: str = x[0] # type: ignore master_show_name: str = x[0] # type: ignore

View File

@ -1,5 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
if __name__ == "__main__":
import sys
from pathlib import Path
sys.path.append(str(Path(f"{__file__}/../../..").resolve()))
from guessit.api import default_api from guessit.api import default_api
from typing import cast, List from typing import cast, List
from rebulk import Rebulk from rebulk import Rebulk
@ -29,14 +35,11 @@ def guessit(name: str, *, xem_titles: List[str] = []):
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
import json import json
from pathlib import Path from providers.implementations.thexem import TheXem
from guessit.jsonutils import GuessitEncoder from guessit.jsonutils import GuessitEncoder
from aiohttp import ClientSession from aiohttp import ClientSession
import asyncio import asyncio
sys.path.append(str(Path(f"{__file__}/../../..").resolve()))
from providers.implementations.thexem import TheXem
async def main(): async def main():
async with ClientSession() as client: async with ClientSession() as client:
xem = TheXem(client) xem = TheXem(client)

View File

@ -3,9 +3,10 @@
from typing import Any, List, Optional, cast from typing import Any, List, Optional, cast
from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS
from rebulk.match import Matches, Match from rebulk.match import Matches, Match
import re
from copy import copy from copy import copy
from providers.implementations.thexem import clean
class EpisodeTitlePromotion(Rule): class EpisodeTitlePromotion(Rule):
"""Promote "episode_title" to "episode" when the title is in fact the episode number """Promote "episode_title" to "episode" when the title is in fact the episode number
@ -250,8 +251,5 @@ class XemFixup(Rule):
new_title.end = nmatch[0].end new_title.end = nmatch[0].end
new_title.value = f"{title.value}{hole}{nmatch[0].value}" new_title.value = f"{title.value}{hole}{nmatch[0].value}"
def clean(s: str):
return re.sub(r"[\W_]+", "", s.lower())
if clean(new_title.value) in context["xem_titles"]: if clean(new_title.value) in context["xem_titles"]:
return [[title, nmatch[0]], [new_title]] return [[title, nmatch[0]], [new_title]]