Improve xem titles sanitizing

This commit is contained in:
Zoe Roux 2024-02-05 23:20:28 +01:00
parent fbd76594ea
commit 8b2c0f732f
4 changed files with 27 additions and 16 deletions

View File

@ -463,7 +463,9 @@ class TheMovieDatabase(Provider):
else None
)
if tvdb_id is None:
logging.info("Tvdb could not be found, trying xem name lookup for %s", name)
logging.info(
"Tvdb could not be found, trying xem name lookup for %s", name
)
_, tvdb_id = await self._xem.get_show_override("tvdb", old_name)
if tvdb_id is not None:
(

View File

@ -8,6 +8,17 @@ from providers.utils import ProviderError
from scanner.cache import cache
def clean(s: str) -> str:
    """Normalize a show title so different spellings of the same name compare equal.

    The title is lowercased, parenthesized parts are dropped, separator
    characters become spaces, runs of spaces are collapsed and surrounding
    whitespace is trimmed.

    Args:
        s: The raw title.

    Returns:
        The normalized title.
    """
    s = s.lower()
    # remove content of () (guessit does not allow them as part of a name)
    s = re.sub(r"\([^)]*\)", "", s)
    # remove separators
    s = re.sub(r"[:\-_/\\&|,;.=\"'+~@`ー]+", " ", s)
    # remove subsequent spaces (that may be introduced above)
    s = re.sub(r" +", " ", s)
    # Removing "(2020)" or a separator at either end leaves a dangling space,
    # which would make "Show (2020)" and "Show" compare unequal: trim it.
    return s.strip()
class TheXem:
def __init__(self, client: ClientSession) -> None:
self._client = client
@ -61,12 +72,13 @@ class TheXem:
self, provider: Literal["tvdb"] | Literal["anidb"], show_name: str
):
map = await self.get_map(provider)
show_name = clean(show_name)
for [id, v] in map.items():
# Only the first element is a string (the show name) so we need to ignore the type hint
master_show_name: str = v[0] # type: ignore
for x in v[1:]:
[(name, season)] = x.items()
if show_name.lower() == name.lower():
if show_name == clean(name):
return master_show_name, id
return None, None
@ -76,11 +88,12 @@ class TheXem:
map = await self.get_map(provider)
if id not in map:
return None
show_name = clean(show_name)
# Ignore the first element, this is the show name as a string
for x in map[id][1:]:
[(name, season)] = x.items()
# TODO: replace .lower() with something a bit smarter
if show_name.lower() == name.lower():
if show_name == clean(name):
return season
return None
@ -133,11 +146,6 @@ class TheXem:
map = await self.get_map(provider)
titles = []
def clean(s: str):
s = s.lower()
s = re.sub(r"\([^)]*\)", "", s) # remove content of () (guessit does not allow them as part of a name)
return re.sub(r"[\W_]+", "", s) # remove non alphanum content (it does keep non us chars like kanjis or accents)
for x in map.values():
# Only the first element is a string (the show name) so we need to ignore the type hint
master_show_name: str = x[0] # type: ignore

View File

@ -1,5 +1,11 @@
#!/usr/bin/env python3
if __name__ == "__main__":
import sys
from pathlib import Path
sys.path.append(str(Path(f"{__file__}/../../..").resolve()))
from guessit.api import default_api
from typing import cast, List
from rebulk import Rebulk
@ -29,14 +35,11 @@ def guessit(name: str, *, xem_titles: List[str] = []):
if __name__ == "__main__":
import sys
import json
from pathlib import Path
from providers.implementations.thexem import TheXem
from guessit.jsonutils import GuessitEncoder
from aiohttp import ClientSession
import asyncio
sys.path.append(str(Path(f"{__file__}/../../..").resolve()))
from providers.implementations.thexem import TheXem
async def main():
async with ClientSession() as client:
xem = TheXem(client)

View File

@ -3,9 +3,10 @@
from typing import Any, List, Optional, cast
from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS
from rebulk.match import Matches, Match
import re
from copy import copy
from providers.implementations.thexem import clean
class EpisodeTitlePromotion(Rule):
"""Promote "episode_title" to "episode" when the title is in fact the episode number
@ -250,8 +251,5 @@ class XemFixup(Rule):
new_title.end = nmatch[0].end
new_title.value = f"{title.value}{hole}{nmatch[0].value}"
def clean(s: str):
return re.sub(r"[\W_]+", "", s.lower())
if clean(new_title.value) in context["xem_titles"]:
return [[title, nmatch[0]], [new_title]]