From 3202c01767ccc141b8b7be3a4e97da23982afedf Mon Sep 17 00:00:00 2001 From: Zoe Roux Date: Sun, 15 Mar 2026 12:38:15 +0100 Subject: [PATCH] Parse animelist.xml --- scanner/pyproject.toml | 1 + scanner/scanner/identifiers/anilist.py | 182 ++++++++++++++++++++++++ scanner/scanner/identifiers/identify.py | 3 +- scanner/scanner/models/videos.py | 1 + scanner/scanner/providers/names.py | 1 + scanner/uv.lock | 15 ++ 6 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 scanner/scanner/identifiers/anilist.py diff --git a/scanner/pyproject.toml b/scanner/pyproject.toml index 2964bd29..5b9fcf31 100644 --- a/scanner/pyproject.toml +++ b/scanner/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "opentelemetry-instrumentation-fastapi>=0.59b0", "opentelemetry-sdk>=1.38.0", "pydantic>=2.11.4", + "pydantic-xml>=2.14.0", "pyjwt[crypto]>=2.10.1", "python-slugify>=8.0.4", "watchfiles>=1.0.5", diff --git a/scanner/scanner/identifiers/anilist.py b/scanner/scanner/identifiers/anilist.py new file mode 100644 index 00000000..1a426564 --- /dev/null +++ b/scanner/scanner/identifiers/anilist.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import re +import unicodedata +from dataclasses import dataclass +from datetime import datetime, timedelta +from functools import cached_property +from logging import getLogger +from typing import Literal + +from aiohttp import ClientSession +from pydantic import field_validator +from pydantic_xml import BaseXmlModel, attr, element + +from ..cache import cache +from ..models.metadataid import EpisodeId, MetadataId +from ..models.videos import Guess +from ..providers.names import ProviderName + +logger = getLogger(__name__) + + +class AnimeTitlesDb(BaseXmlModel, tag="animetitles"): + animes: list[AnimeTitlesEntry] = element(default=[]) + + @classmethod + def get_url(cls): + return "https://raw.githubusercontent.com/Anime-Lists/anime-lists/master/animetitles.xml" + + class AnimeTitlesEntry(BaseXmlModel, tag="anime"): + aid: str = 
attr() + titles: list[AnimeTitle] = element(default=[]) + + class AnimeTitle( + BaseXmlModel, + tag="title", + nsmap={"xml": "http://www.w3.org/XML/1998/namespace"}, + ): + type: str = attr() + lang: str = attr(ns="xml") + text: str + + +class AnimeListDb(BaseXmlModel, tag="anime-list"): + animes: list[AnimeEntry] = element(default=[]) + + @classmethod + def get_url(cls): + return "https://raw.githubusercontent.com/Anime-Lists/anime-lists/refs/heads/master/anime-list.xml" + + class AnimeEntry(BaseXmlModel, tag="anime"): + anidbid: str = attr() + tvdbid: str | None = attr(default=None) + defaulttvdbseason: int | Literal["a"] | None = attr(default=None) + episodeoffset: int = attr(default=0) + tmdbid: str | None = attr(default=None) + imdbid: str | None = attr(default=None) + name: str | None = element(default=None) + mapping_list: MappingList | None = element(default=[]) + + @field_validator("tmdbid", "imdbid") + @classmethod + def _empty_to_none(cls, v: str | None) -> str | None: + return v or None + + class MappingList(BaseXmlModel, tag="mapping-list"): + mappings: list[EpisodeMapping] = element(default=[]) + + class EpisodeMapping(BaseXmlModel): + anidbseason: int = attr() + tvdbseason: int | None = attr(default=None) + start: int | None = attr(default=None) + end: int | None = attr(default=None) + offset: int = attr(default=0) + text: str | None = None + + @cached_property + def tvdb_mappings(self) -> dict[int, list[int]]: + if self.tvdbseason is None or not self.text: + return {} + ret = {} + for map in self.text.split(";"): + map = map.strip() + if not map or "-" not in map: + continue + [aid, tvdbids] = map.split("-", 1) + try: + ret[int(aid.strip())] = [ + int(x.strip()) for x in tvdbids.split("+") + ] + except ValueError: + continue + return ret + + +@dataclass +class AnimeListData: + fetched_at: datetime + # normalized title -> anidbid + titles: dict[str, str] = {} + # anidbid -> AnimeEntry + animes: dict[str, AnimeListDb.AnimeEntry] = {} + + 
@cache(ttl=timedelta(days=30))
async def get_data() -> AnimeListData:
    """Download and index both anime-lists XML databases (cached for 30 days).

    Builds a normalized-title -> anidbid index from animetitles.xml and an
    anidbid -> AnimeEntry index from anime-list.xml.
    """
    logger.info("Fetching anime-lists XML databases...")
    ret = AnimeListData(fetched_at=datetime.now())
    async with ClientSession() as session:
        async with session.get(AnimeTitlesDb.get_url()) as resp:
            resp.raise_for_status()
            titles = AnimeTitlesDb.from_xml(await resp.read())
            ret.titles = {
                normalize_title(title.text): x.aid
                for x in titles.animes
                for title in x.titles
            }
        async with session.get(AnimeListDb.get_url()) as resp:
            resp.raise_for_status()
            db = AnimeListDb.from_xml(await resp.read())
            ret.animes = {entry.anidbid: entry for entry in db.animes}

    logger.info(
        "Loaded %d anime titles from animelist-xml.",
        len(ret.titles),
    )
    return ret


def normalize_title(title: str) -> str:
    """Normalize a title for fuzzy lookup.

    Strips accents (NFD decomposition then dropping combining marks),
    lowercases, removes punctuation and collapses whitespace.
    """
    title = unicodedata.normalize("NFD", title)
    title = "".join(c for c in title if unicodedata.category(c) != "Mn")
    title = title.lower()
    title = re.sub(r"[^\w\s]", "", title)
    title = re.sub(r"\s+", " ", title).strip()
    return title


async def anilist(_path: str, guess: Guess) -> Guess:
    """Pipeline step: attach AniDB/TVDB/TMDB/IMDB ids to a guess by title.

    Returns the guess unchanged when no AniDB match is found.
    """
    data = await get_data()

    # BUG FIX: the titles index is keyed by normalize_title(...), so the
    # lookup must normalize the guessed title too — a raw title with casing,
    # accents or punctuation would otherwise never match.
    aid = data.titles.get(normalize_title(guess.title))
    if aid is None:
        return guess
    anime = data.animes.get(aid)
    if anime is None:
        logger.warning("AniDB id %s found in titles but not in anime-list.xml", aid)
        return guess

    logger.info(
        "Matched '%s' to AniDB id %s (tvdb=%s, tmdbid=%s)",
        guess.title,
        aid,
        anime.tvdbid,
        anime.tmdbid,
    )

    new_external_id = dict(guess.external_id)
    new_external_id[ProviderName.ANIDB] = aid
    if anime.tvdbid:
        new_external_id[ProviderName.TVDB] = anime.tvdbid
    if anime.tmdbid:
        new_external_id[ProviderName.TMDB] = anime.tmdbid
    if anime.imdbid:
        new_external_id[ProviderName.IMDB] = anime.imdbid

    new_episodes: list[Guess.Episode] = []
    for ep in guess.episodes:
        # TODO: map anidb season/episode numbers to tvdb ones via
        # anime.mapping_list / episodeoffset.
        # BUG FIX: until the mapping is implemented, carry episodes through
        # unchanged instead of silently dropping them all (the previous
        # placeholder left new_episodes empty).
        new_episodes.append(ep)

    return Guess(
        title=guess.title,
        kind=guess.kind,
        extra_kind=guess.extra_kind,
        years=guess.years,
        episodes=new_episodes,
        external_id=new_external_id,
        raw=guess.raw,
        from_="anilist",
        history=[*guess.history, guess],
    )
"pydantic-xml" +version = "2.19.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "pydantic-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/cb/5f80b61d73a8d6171ee4611bfd2b944c036c6f6e5f6e01d9fb02f29d7bfc/pydantic_xml-2.19.0.tar.gz", hash = "sha256:b7acba5a0966cbbbc9bf88d0d870b2bc875da063fe1bbe62d83939b549224730", size = 26228, upload-time = "2026-02-14T17:33:53.368Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/2d/dce0dc471fade04829c2948462d79c9bc4991305b0f73889f70c9645e540/pydantic_xml-2.19.0-py3-none-any.whl", hash = "sha256:42854bf962758bec338c112c2de984723708262793e108416f33aa4d6c11b3b4", size = 42536, upload-time = "2026-02-14T17:33:54.206Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -1482,6 +1495,7 @@ dependencies = [ { name = "opentelemetry-instrumentation-fastapi" }, { name = "opentelemetry-sdk" }, { name = "pydantic" }, + { name = "pydantic-xml" }, { name = "pyjwt", extra = ["crypto"] }, { name = "python-slugify" }, { name = "watchfiles" }, @@ -1502,6 +1516,7 @@ requires-dist = [ { name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.59b0" }, { name = "opentelemetry-sdk", specifier = ">=1.38.0" }, { name = "pydantic", specifier = ">=2.11.4" }, + { name = "pydantic-xml", specifier = ">=2.14.0" }, { name = "pyjwt", extras = ["crypto"], specifier = ">=2.10.1" }, { name = "python-slugify", specifier = ">=8.0.4" }, { name = "watchfiles", specifier = ">=1.0.5" },