From f7e801e57407b75d4ea81238a6ae876a87a66ee4 Mon Sep 17 00:00:00 2001 From: Zoe Roux Date: Thu, 20 Nov 2025 09:44:00 +0100 Subject: [PATCH] Chunk identify scans --- scanner/scanner/fsscan.py | 43 +++++----- scanner/scanner/identifiers/identify.py | 107 +++++++++++++----------- 2 files changed, 81 insertions(+), 69 deletions(-) diff --git a/scanner/scanner/fsscan.py b/scanner/scanner/fsscan.py index 018d33a3..d738a734 100644 --- a/scanner/scanner/fsscan.py +++ b/scanner/scanner/fsscan.py @@ -1,3 +1,5 @@ +import asyncio +import itertools import os import re from contextlib import asynccontextmanager @@ -111,30 +113,33 @@ class FsScanner: logger.error("Unexpected error while monitoring files.", exc_info=e) async def _register(self, videos: list[str] | set[str]): - # TODO: we should probably chunk those - vids: list[Video] = [] - for path in list(videos): + async def process(path: str): try: vid = await identify(path) - vid = self._match(vid) - vids.append(vid) + return self._match(vid) except Exception as e: logger.error("Couldn't identify %s.", path, exc_info=e) - created = await self._client.create_videos(vids) + return None - await self._requests.enqueue( - [ - Request( - kind=x.guess.kind, - title=x.guess.title, - year=next(iter(x.guess.years), None), - external_id=x.guess.external_id, - videos=[Request.Video(id=x.id, episodes=x.guess.episodes)], - ) - for x in created - if not any(x.entries) and x.guess.kind != "extra" - ] - ) + for batch in itertools.batched(videos, 20): + vids = await asyncio.gather(*(process(path) for path in batch)) + created = await self._client.create_videos( + [v for v in vids if v is not None] + ) + + await self._requests.enqueue( + [ + Request( + kind=x.guess.kind, + title=x.guess.title, + year=next(iter(x.guess.years), None), + external_id=x.guess.external_id, + videos=[Request.Video(id=x.id, episodes=x.guess.episodes)], + ) + for x in created + if not any(x.entries) and x.guess.kind != "extra" + ] + ) def _match(self, video: Video) -> Video: video.for_ = [] diff --git a/scanner/scanner/identifiers/identify.py b/scanner/scanner/identifiers/identify.py index 90e61f1a..7b51ce45 100644 --- a/scanner/scanner/identifiers/identify.py +++ b/scanner/scanner/identifiers/identify.py @@ -1,15 +1,18 @@ +import os from collections.abc import Awaitable from hashlib import sha256 from itertools import zip_longest from logging import getLogger from typing import Callable, Literal, cast +from opentelemetry import trace from rebulk.match import Match from ..models.videos import Guess, Video from .guess.guess import guessit logger = getLogger(__name__) +tracer = trace.get_tracer("kyoo.scanner") pipeline: list[Callable[[str, Guess], Awaitable[Guess]]] = [ # TODO: add nfo scanner @@ -19,62 +22,66 @@ pipeline: list[Callable[[str, Guess], Awaitable[Guess]]] = [ async def identify(path: str) -> Video: - raw = guessit(path, expected_titles=[]) + with tracer.start_as_current_span(f"identify {os.path.basename(path)}") as span: + span.set_attribute("video.path", path) - # guessit should only return one (according to the doc) - title = raw.get("title", [])[0] - kind = raw.get("type", [])[0] - version = next(iter(raw.get("version", [])), None) - # apparently guessit can return multiples but tbh idk what to do with - # multiples part. we'll just ignore them for now - part = next(iter(raw.get("part", [])), None) + raw = guessit(path, expected_titles=[]) - years = raw.get("year", []) - seasons = raw.get("season", []) - episodes = raw.get("episode", []) + # guessit should only return one (according to the doc) + title = raw.get("title", [])[0] + kind = raw.get("type", [])[0] + version = next(iter(raw.get("version", [])), None) + # apparently guessit can return multiples but tbh idk what to do with + # multiples part. we'll just ignore them for now + part = next(iter(raw.get("part", [])), None) - # just strip the version & part number from the path - rendering_path = "".join( - c - for i, c in enumerate(path) - if not (version and version.start <= i < version.end) - and not (part and part.start <= i < part.end) - ) + years = raw.get("year", []) + seasons = raw.get("season", []) + episodes = raw.get("episode", []) - guess = Guess( - title=cast(str, title.value), - kind=cast(Literal["episode", "movie"], kind.value), - extra_kind=None, - years=[cast(int, y.value) for y in years], - episodes=[ - Guess.Episode(season=cast(int, s.value), episode=cast(int, e.value)) - for s, e in zip_longest( - seasons, - episodes, - fillvalue=seasons[-1] if any(seasons) else Match(0, 0, value=1), - ) - ], - external_id={}, - from_="guessit", - raw={ - k: [x.value if x.value is int else str(x.value) for x in v] - for k, v in raw.items() - }, - ) + # just strip the version & part number from the path + rendering_path = "".join( + c + for i, c in enumerate(path) + if not (version and version.start <= i < version.end) + and not (part and part.start <= i < part.end) + ) - for step in pipeline: - try: - guess = await step(path, guess) - except Exception as e: - logger.error("Couldn't run %s.", step.__name__, exc_info=e) + guess = Guess( + title=cast(str, title.value), + kind=cast(Literal["episode", "movie"], kind.value), + extra_kind=None, + years=[cast(int, y.value) for y in years], + episodes=[ + Guess.Episode(season=cast(int, s.value), episode=cast(int, e.value)) + for s, e in zip_longest( + seasons, + episodes, + fillvalue=seasons[-1] if any(seasons) else Match(0, 0, value=1), + ) + ], + external_id={}, + from_="guessit", + raw={ + k: [x.value if x.value is int else str(x.value) for x in v] + for k, v in raw.items() + }, + ) + span.set_attribute("video.name", guess.title) - return Video( - path=path, - rendering=sha256(rendering_path.encode()).hexdigest(), - part=cast(int, part.value) if part else None, - version=cast(int, version.value) if version else 1, - guess=guess, - ) + for step in pipeline: + try: + guess = await step(path, guess) + except Exception as e: + logger.error("Couldn't run %s.", step.__name__, exc_info=e) + + return Video( + path=path, + rendering=sha256(rendering_path.encode()).hexdigest(), + part=cast(int, part.value) if part else None, + version=cast(int, version.value) if version else 1, + guess=guess, + ) if __name__ == "__main__":