From 01d2dc25d669a90642b5234cb9211c974358a3fa Mon Sep 17 00:00:00 2001 From: Zoe Roux Date: Tue, 6 May 2025 02:14:35 +0200 Subject: [PATCH] Try stuff with rendering & guessit rules --- scanner/scanner/__init__.py | 2 +- scanner/scanner/guess/guess.py | 11 +++++-- scanner/scanner/guess/rules.py | 38 ++++++++++++++++++++--- scanner/scanner/identify.py | 56 ++++++++++++++++++++++------------ 4 files changed, 79 insertions(+), 28 deletions(-) diff --git a/scanner/scanner/__init__.py b/scanner/scanner/__init__.py index f2cd0c63..d5d02af1 100644 --- a/scanner/scanner/__init__.py +++ b/scanner/scanner/__init__.py @@ -4,7 +4,7 @@ app = FastAPI( title="Scanner", description="API to control the long running scanner or interacting with external databases (themoviedb, tvdb...)\n\n" + "Most of those APIs are for admins only.", - openapi_prefix="/scanner", + root_path="/scanner", # lifetime=smth ) diff --git a/scanner/scanner/guess/guess.py b/scanner/scanner/guess/guess.py index 0b483e1e..d17f8c2b 100644 --- a/scanner/scanner/guess/guess.py +++ b/scanner/scanner/guess/guess.py @@ -9,7 +9,7 @@ if __name__ == "__main__": from guessit.api import default_api from typing import cast, List, Any from rebulk import Rebulk -from rebulk.match import MatchesDict +from rebulk.match import Match try: from . import rules @@ -26,17 +26,22 @@ def guessit( *, expected_titles: List[str] = [], extra_flags: dict[str, Any] = {}, -) -> MatchesDict: - return default_api.guessit( +) -> dict[str, list[Match]]: + rendering = [] + ret = default_api.guessit( name, { "episode_prefer_number": True, "excludes": "language", "expected_title": expected_titles, "enforce_list": True, + "advanced": True, + "rendering": rendering, } | extra_flags, ) + print(rendering) + return ret # Only used to test locally diff --git a/scanner/scanner/guess/rules.py b/scanner/scanner/guess/rules.py index f304412c..c7fe6a2b 100644 --- a/scanner/scanner/guess/rules.py +++ b/scanner/scanner/guess/rules.py @@ -1,10 +1,12 @@ # Read that for examples/rules: https://github.com/pymedusa/Medusa/blob/master/medusa/name_parser/rules/rules.py +import hashlib +from copy import copy from logging import getLogger from typing import Any, List, Optional, cast -from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS -from rebulk.match import Matches, Match -from copy import copy + +from rebulk import POST_PROCESS, AppendMatch, RemoveMatch, Rule +from rebulk.match import Match, Matches logger = getLogger(__name__) @@ -67,7 +69,7 @@ class UnlistTitles(Rule): # Check if titles are next to each other, if they are not ignore it. next: List[Match] = matches.next(title) # type: ignore if not next or next[0] != nmatch: - logger.warn(f"Ignoring potential part of title: {nmatch.value}") + logger.warning(f"Ignoring potential part of title: {nmatch.value}") continue title.end = nmatch.end @@ -176,7 +178,7 @@ class SeasonYearDedup(Rule): """ # This rules does the opposite of the YearSeason rule of guessit (with POST_PROCESS priority) - # To overide it, we need the -1. (rule: https://github.com/guessit-io/guessit/blob/develop/guessit/rules/processors.py#L195) + # To override it, we need the -1. (rule: https://github.com/guessit-io/guessit/blob/develop/guessit/rules/processors.py#L195) priority = POST_PROCESS - 1 consequence = RemoveMatch @@ -185,3 +187,29 @@ class SeasonYearDedup(Rule): year: List[Match] = matches.named("year") # type: ignore if len(season) == 1 and len(year) == 1 and season[0].value == year[0].value: return season + + +# class RenderingDedup(Rule): +# """Compute rendering (sha of path - version/part) +# +# Example: "One Piece (1999) v2 152 part2.mkv" +# Computes: sha("One Piece (1999) 152.mkv") +# ``` +# """ +# +# priority = POST_PROCESS + 100000 +# consequence = AppendMatch +# +# def when(self, matches: Matches, context: dict[str, list[str]]) -> Any: +# ret = hashlib.new("sha256") +# +# value: list[Match] = sorted( +# list(matches) + matches.holes(), # type: ignore +# key=lambda m: m.start, +# ) +# for m in value: +# if m.name == "part" or m.name == "version": +# continue +# ret.update(cast(str, m.raw).encode("utf-8")) +# context["rendering"] = [ret.hexdigest()] +# return [Match(start=0, end=1, value=ret.hexdigest(), raw="", name="rendering")] diff --git a/scanner/scanner/identify.py b/scanner/scanner/identify.py index 8fe40047..c72031db 100644 --- a/scanner/scanner/identify.py +++ b/scanner/scanner/identify.py @@ -1,6 +1,7 @@ +from hashlib import sha256 from itertools import zip_longest from logging import getLogger -from typing import Awaitable, Callable, Literal +from typing import Awaitable, Callable, Literal, Optional, cast from .guess.guess import guessit from .models.videos import Guess, Video @@ -15,37 +16,43 @@ pipeline: list[Callable[[str, Guess], Awaitable[Guess]]] = [ async def identify(path: str) -> Video: - raw = guessit(path, expected_titles=[]) + raw = guessit( + path, + expected_titles=[], + extra_flags={"advanced": True}, + ) # guessit should only return one (according to the doc) - title: str = raw.get("title", [])[0] - kind: Literal["movie"] | Literal["episode"] = raw.get("type", [])[0] - version: int = raw.get("version", [])[0] + title = raw.get("title", [])[0] + kind = raw.get("type", [])[0] + version = next(iter(raw.get("version", [])), None) # apparently guessit can return multiples but tbh idk what to do with # multiples part. we'll just ignore them for now - part: int = raw.get("part", [])[0] + part = next(iter(raw.get("part", [])), None) - years: list[int] = raw.get("year", []) - seasons: list[int] = raw.get("season", []) - episodes: list[int] = raw.get("episode", []) + years = raw.get("year", []) + seasons = raw.get("season", []) + episodes = raw.get("episode", []) + rendering = path[:version.start] + path[version.end:] + + print(raw) guess = Guess( - title=title, - kind=kind, - extraKind=None, - years=years, + title=cast(str, title.value), + kind=cast(Literal["episode"] | Literal["movie"], kind.value), + extra_kind=None, + years=[cast(int, y.value) for y in years], episodes=[ - Guess.Episode(season=s, episode=e) + Guess.Episode(season=cast(int, s.value), episode=cast(int, e.value)) for s, e in zip_longest( seasons, episodes, fillvalue=seasons[-1] if len(seasons) < len(episodes) else episodes[-1], ) ], - # TODO: add external ids parsing in guessit external_id={}, from_="guessit", - raw=raw, + raw={k: [x.value for x in v] for k, v in raw.items()}, ) for step in pipeline: @@ -56,8 +63,19 @@ async def identify(path: str) -> Video: return Video( path=path, - rendering="", - part=part, - version=version, + rendering=sha256(path.encode()).hexdigest(), + part=cast(int, part.value) if part else None, + version=cast(int, version.value) if version else 1, guess=guess, ) + + +if __name__ == "__main__": + import asyncio + import sys + + async def main(): + ret = await identify(sys.argv[1]) + print(ret.model_dump_json(indent=4, by_alias=True)) + + asyncio.run(main())