Try stuff with rendering & guessit rules

This commit is contained in:
Zoe Roux 2025-05-06 02:14:35 +02:00
parent 461333a90f
commit 01d2dc25d6
No known key found for this signature in database
4 changed files with 79 additions and 28 deletions

View File

@ -4,7 +4,7 @@ app = FastAPI(
title="Scanner",
description="API to control the long running scanner or interacting with external databases (themoviedb, tvdb...)\n\n"
+ "Most of those APIs are for admins only.",
openapi_prefix="/scanner",
root_path="/scanner",
# lifetime=smth
)

View File

@ -9,7 +9,7 @@ if __name__ == "__main__":
from guessit.api import default_api
from typing import cast, List, Any
from rebulk import Rebulk
from rebulk.match import MatchesDict
from rebulk.match import Match
try:
from . import rules
@ -26,17 +26,22 @@ def guessit(
*,
expected_titles: List[str] = [],
extra_flags: dict[str, Any] = {},
) -> MatchesDict:
return default_api.guessit(
) -> dict[str, list[Match]]:
rendering = []
ret = default_api.guessit(
name,
{
"episode_prefer_number": True,
"excludes": "language",
"expected_title": expected_titles,
"enforce_list": True,
"advanced": True,
"rendering": rendering,
}
| extra_flags,
)
print(rendering)
return ret
# Only used to test locally

View File

@ -1,10 +1,12 @@
# Read that for examples/rules: https://github.com/pymedusa/Medusa/blob/master/medusa/name_parser/rules/rules.py
import hashlib
from copy import copy
from logging import getLogger
from typing import Any, List, Optional, cast
from rebulk import Rule, RemoveMatch, AppendMatch, POST_PROCESS
from rebulk.match import Matches, Match
from copy import copy
from rebulk import POST_PROCESS, AppendMatch, RemoveMatch, Rule
from rebulk.match import Match, Matches
logger = getLogger(__name__)
@ -67,7 +69,7 @@ class UnlistTitles(Rule):
# Check if titles are next to each other, if they are not ignore it.
next: List[Match] = matches.next(title) # type: ignore
if not next or next[0] != nmatch:
logger.warn(f"Ignoring potential part of title: {nmatch.value}")
logger.warning(f"Ignoring potential part of title: {nmatch.value}")
continue
title.end = nmatch.end
@ -176,7 +178,7 @@ class SeasonYearDedup(Rule):
"""
# This rule does the opposite of the YearSeason rule of guessit (with POST_PROCESS priority)
# To overide it, we need the -1. (rule: https://github.com/guessit-io/guessit/blob/develop/guessit/rules/processors.py#L195)
# To override it, we need the -1. (rule: https://github.com/guessit-io/guessit/blob/develop/guessit/rules/processors.py#L195)
priority = POST_PROCESS - 1
consequence = RemoveMatch
@ -185,3 +187,29 @@ class SeasonYearDedup(Rule):
year: List[Match] = matches.named("year") # type: ignore
if len(season) == 1 and len(year) == 1 and season[0].value == year[0].value:
return season
# class RenderingDedup(Rule):
# """Compute the rendering: a sha256 over the name with `version`/`part` matches left out
#
# Example: "One Piece (1999) v2 152 part2.mkv"
# Computes: sha("One Piece (1999) 152.mkv")
# """
#
# priority = POST_PROCESS + 100000
# consequence = AppendMatch
#
# def when(self, matches: Matches, context: dict[str, list[str]]) -> Any:
# ret = hashlib.new("sha256")
#
# value: list[Match] = sorted(
# list(matches) + matches.holes(), # type: ignore
# key=lambda m: m.start,
# )
# for m in value:
# if m.name == "part" or m.name == "version":
# continue
# ret.update(cast(str, m.raw).encode("utf-8"))
# context["rendering"] = [ret.hexdigest()]
# return [Match(start=0, end=1, value=ret.hexdigest(), raw="", name="rendering")]

View File

@ -1,6 +1,7 @@
from hashlib import sha256
from itertools import zip_longest
from logging import getLogger
from typing import Awaitable, Callable, Literal
from typing import Awaitable, Callable, Literal, Optional, cast
from .guess.guess import guessit
from .models.videos import Guess, Video
@ -15,37 +16,43 @@ pipeline: list[Callable[[str, Guess], Awaitable[Guess]]] = [
async def identify(path: str) -> Video:
raw = guessit(path, expected_titles=[])
raw = guessit(
path,
expected_titles=[],
extra_flags={"advanced": True},
)
# guessit should only return one (according to the doc)
title: str = raw.get("title", [])[0]
kind: Literal["movie"] | Literal["episode"] = raw.get("type", [])[0]
version: int = raw.get("version", [])[0]
title = raw.get("title", [])[0]
kind = raw.get("type", [])[0]
version = next(iter(raw.get("version", [])), None)
# guessit can apparently return several `part` matches; it is unclear how to
# handle more than one, so only the first match is used for now
part: int = raw.get("part", [])[0]
part = next(iter(raw.get("part", [])), None)
years: list[int] = raw.get("year", [])
seasons: list[int] = raw.get("season", [])
episodes: list[int] = raw.get("episode", [])
years = raw.get("year", [])
seasons = raw.get("season", [])
episodes = raw.get("episode", [])
rendering = path[:version.start] + path[version.end:]
print(raw)
guess = Guess(
title=title,
kind=kind,
extraKind=None,
years=years,
title=cast(str, title.value),
kind=cast(Literal["episode"] | Literal["movie"], kind.value),
extra_kind=None,
years=[cast(int, y.value) for y in years],
episodes=[
Guess.Episode(season=s, episode=e)
Guess.Episode(season=cast(int, s.value), episode=cast(int, e.value))
for s, e in zip_longest(
seasons,
episodes,
fillvalue=seasons[-1] if len(seasons) < len(episodes) else episodes[-1],
)
],
# TODO: add external ids parsing in guessit
external_id={},
from_="guessit",
raw=raw,
raw={k: [x.value for x in v] for k, v in raw.items()},
)
for step in pipeline:
@ -56,8 +63,19 @@ async def identify(path: str) -> Video:
return Video(
path=path,
rendering="",
part=part,
version=version,
rendering=sha256(path.encode()).hexdigest(),
part=cast(int, part.value) if part else None,
version=cast(int, version.value) if version else 1,
guess=guess,
)
if __name__ == "__main__":
    # Manual entry point: identify the path given as the first command-line
    # argument and print the resulting model as indented JSON.
    import asyncio
    import sys

    async def main():
        """Identify ``sys.argv[1]`` and dump the result to stdout."""
        video = await identify(sys.argv[1])
        dumped = video.model_dump_json(indent=4, by_alias=True)
        print(dumped)

    asyncio.run(main())