From 3202c01767ccc141b8b7be3a4e97da23982afedf Mon Sep 17 00:00:00 2001 From: Zoe Roux Date: Sun, 15 Mar 2026 12:38:15 +0100 Subject: [PATCH] Parse animelist.xml --- scanner/pyproject.toml | 1 + scanner/scanner/identifiers/anilist.py | 182 ++++++++++++++++++++++++ scanner/scanner/identifiers/identify.py | 3 +- scanner/scanner/models/videos.py | 1 + scanner/scanner/providers/names.py | 1 + scanner/uv.lock | 15 ++ 6 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 scanner/scanner/identifiers/anilist.py diff --git a/scanner/pyproject.toml b/scanner/pyproject.toml index 2964bd29..5b9fcf31 100644 --- a/scanner/pyproject.toml +++ b/scanner/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "opentelemetry-instrumentation-fastapi>=0.59b0", "opentelemetry-sdk>=1.38.0", "pydantic>=2.11.4", + "pydantic-xml>=2.14.0", "pyjwt[crypto]>=2.10.1", "python-slugify>=8.0.4", "watchfiles>=1.0.5", diff --git a/scanner/scanner/identifiers/anilist.py b/scanner/scanner/identifiers/anilist.py new file mode 100644 index 00000000..1a426564 --- /dev/null +++ b/scanner/scanner/identifiers/anilist.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import re +import unicodedata +from dataclasses import dataclass +from datetime import datetime, timedelta +from functools import cached_property +from logging import getLogger +from typing import Literal + +from aiohttp import ClientSession +from pydantic import field_validator +from pydantic_xml import BaseXmlModel, attr, element + +from ..cache import cache +from ..models.metadataid import EpisodeId, MetadataId +from ..models.videos import Guess +from ..providers.names import ProviderName + +logger = getLogger(__name__) + + +class AnimeTitlesDb(BaseXmlModel, tag="animetitles"): + animes: list[AnimeTitlesEntry] = element(default=[]) + + @classmethod + def get_url(cls): + return "https://raw.githubusercontent.com/Anime-Lists/anime-lists/master/animetitles.xml" + + class AnimeTitlesEntry(BaseXmlModel, tag="anime"): + aid: str = 
attr() + titles: list[AnimeTitle] = element(default=[]) + + class AnimeTitle( + BaseXmlModel, + tag="title", + nsmap={"xml": "http://www.w3.org/XML/1998/namespace"}, + ): + type: str = attr() + lang: str = attr(ns="xml") + text: str + + +class AnimeListDb(BaseXmlModel, tag="anime-list"): + animes: list[AnimeEntry] = element(default=[]) + + @classmethod + def get_url(cls): + return "https://raw.githubusercontent.com/Anime-Lists/anime-lists/refs/heads/master/anime-list.xml" + + class AnimeEntry(BaseXmlModel, tag="anime"): + anidbid: str = attr() + tvdbid: str | None = attr(default=None) + defaulttvdbseason: int | Literal["a"] | None = attr(default=None) + episodeoffset: int = attr(default=0) + tmdbid: str | None = attr(default=None) + imdbid: str | None = attr(default=None) + name: str | None = element(default=None) + mapping_list: MappingList | None = element(default=[]) + + @field_validator("tmdbid", "imdbid") + @classmethod + def _empty_to_none(cls, v: str | None) -> str | None: + return v or None + + class MappingList(BaseXmlModel, tag="mapping-list"): + mappings: list[EpisodeMapping] = element(default=[]) + + class EpisodeMapping(BaseXmlModel): + anidbseason: int = attr() + tvdbseason: int | None = attr(default=None) + start: int | None = attr(default=None) + end: int | None = attr(default=None) + offset: int = attr(default=0) + text: str | None = None + + @cached_property + def tvdb_mappings(self) -> dict[int, list[int]]: + if self.tvdbseason is None or not self.text: + return {} + ret = {} + for map in self.text.split(";"): + map = map.strip() + if not map or "-" not in map: + continue + [aid, tvdbids] = map.split("-", 1) + try: + ret[int(aid.strip())] = [ + int(x.strip()) for x in tvdbids.split("+") + ] + except ValueError: + continue + return ret + + +@dataclass +class AnimeListData: + fetched_at: datetime + # normalized title -> anidbid + titles: dict[str, str] = {} + # anidbid -> AnimeEntry + animes: dict[str, AnimeListDb.AnimeEntry] = {} + + 
@cache(ttl=timedelta(days=30))
async def get_data() -> AnimeListData:
    """Download and index both anime-lists XML databases (cached for 30 days).

    Builds a normalized-title -> anidbid index from animetitles.xml and an
    anidbid -> AnimeEntry index from anime-list.xml.
    """
    logger.info("Fetching anime-lists XML databases...")
    ret = AnimeListData(fetched_at=datetime.now())
    async with ClientSession() as session:
        async with session.get(AnimeTitlesDb.get_url()) as resp:
            resp.raise_for_status()
            titles = AnimeTitlesDb.from_xml(await resp.read())
            ret.titles = {
                normalize_title(title.text): x.aid
                for x in titles.animes
                for title in x.titles
            }
        async with session.get(AnimeListDb.get_url()) as resp:
            resp.raise_for_status()
            db = AnimeListDb.from_xml(await resp.read())
            ret.animes = {entry.anidbid: entry for entry in db.animes}

    logger.info(
        "Loaded %d anime titles from animelist-xml.",
        len(ret.titles),
    )
    return ret


def normalize_title(title: str) -> str:
    """Normalize a title for fuzzy lookup.

    Strips accents (NFD decomposition then dropping combining marks),
    lowercases, removes punctuation and collapses whitespace.
    """
    title = unicodedata.normalize("NFD", title)
    title = "".join(c for c in title if unicodedata.category(c) != "Mn")
    title = title.lower()
    title = re.sub(r"[^\w\s]", "", title)
    title = re.sub(r"\s+", " ", title).strip()
    return title


async def anilist(_path: str, guess: Guess) -> Guess:
    """Pipeline step: attach AniDB/TVDB/TMDB/IMDB ids to a guess by title.

    Returns the guess unchanged when no AniDB match is found.
    """
    data = await get_data()

    # BUG FIX: the titles index is keyed by normalize_title(...), so the
    # lookup must normalize the guessed title too — a raw title with casing,
    # accents or punctuation would otherwise never match.
    aid = data.titles.get(normalize_title(guess.title))
    if aid is None:
        return guess
    anime = data.animes.get(aid)
    if anime is None:
        logger.warning("AniDB id %s found in titles but not in anime-list.xml", aid)
        return guess

    logger.info(
        "Matched '%s' to AniDB id %s (tvdb=%s, tmdbid=%s)",
        guess.title,
        aid,
        anime.tvdbid,
        anime.tmdbid,
    )

    new_external_id = dict(guess.external_id)
    new_external_id[ProviderName.ANIDB] = aid
    if anime.tvdbid:
        new_external_id[ProviderName.TVDB] = anime.tvdbid
    if anime.tmdbid:
        new_external_id[ProviderName.TMDB] = anime.tmdbid
    if anime.imdbid:
        new_external_id[ProviderName.IMDB] = anime.imdbid

    new_episodes: list[Guess.Episode] = []
    for ep in guess.episodes:
        # TODO: map anidb season/episode numbers to tvdb ones via
        # anime.mapping_list / episodeoffset.
        # BUG FIX: until the mapping is implemented, carry episodes through
        # unchanged instead of silently dropping them all (the previous
        # placeholder left new_episodes empty).
        new_episodes.append(ep)

    return Guess(
        title=guess.title,
        kind=guess.kind,
        extra_kind=guess.extra_kind,
        years=guess.years,
        episodes=new_episodes,
        external_id=new_external_id,
        raw=guess.raw,
        from_="anilist",
        history=[*guess.history, guess],
    )
"pydantic-xml" +version = "2.19.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "pydantic-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/cb/5f80b61d73a8d6171ee4611bfd2b944c036c6f6e5f6e01d9fb02f29d7bfc/pydantic_xml-2.19.0.tar.gz", hash = "sha256:b7acba5a0966cbbbc9bf88d0d870b2bc875da063fe1bbe62d83939b549224730", size = 26228, upload-time = "2026-02-14T17:33:53.368Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/2d/dce0dc471fade04829c2948462d79c9bc4991305b0f73889f70c9645e540/pydantic_xml-2.19.0-py3-none-any.whl", hash = "sha256:42854bf962758bec338c112c2de984723708262793e108416f33aa4d6c11b3b4", size = 42536, upload-time = "2026-02-14T17:33:54.206Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -1482,6 +1495,7 @@ dependencies = [ { name = "opentelemetry-instrumentation-fastapi" }, { name = "opentelemetry-sdk" }, { name = "pydantic" }, + { name = "pydantic-xml" }, { name = "pyjwt", extra = ["crypto"] }, { name = "python-slugify" }, { name = "watchfiles" }, @@ -1502,6 +1516,7 @@ requires-dist = [ { name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.59b0" }, { name = "opentelemetry-sdk", specifier = ">=1.38.0" }, { name = "pydantic", specifier = ">=2.11.4" }, + { name = "pydantic-xml", specifier = ">=2.14.0" }, { name = "pyjwt", extras = ["crypto"], specifier = ">=2.10.1" }, { name = "python-slugify", specifier = ">=8.0.4" }, { name = "watchfiles", specifier = ">=1.0.5" },