Parse animelist.xml

This commit is contained in:
Zoe Roux 2026-03-15 12:38:15 +01:00
parent 1d0c8a81ed
commit 3202c01767
No known key found for this signature in database
6 changed files with 202 additions and 1 deletions

View File

@ -18,6 +18,7 @@ dependencies = [
"opentelemetry-instrumentation-fastapi>=0.59b0",
"opentelemetry-sdk>=1.38.0",
"pydantic>=2.11.4",
"pydantic-xml>=2.14.0",
"pyjwt[crypto]>=2.10.1",
"python-slugify>=8.0.4",
"watchfiles>=1.0.5",

View File

@ -0,0 +1,182 @@
from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from functools import cached_property
from logging import getLogger
from typing import Literal

from aiohttp import ClientSession
from pydantic import field_validator
from pydantic_xml import BaseXmlModel, attr, element

from ..cache import cache
from ..models.metadataid import EpisodeId, MetadataId
from ..models.videos import Guess
from ..providers.names import ProviderName
logger = getLogger(__name__)
class AnimeTitlesDb(BaseXmlModel, tag="animetitles"):
    """Root `<animetitles>` element of the anime-lists titles document."""

    # Child entries of the root element; an empty list when none are present.
    animes: list[AnimeTitlesEntry] = element(default=[])

    @classmethod
    def get_url(cls):
        """Return the URL the titles document is downloaded from."""
        url = "https://raw.githubusercontent.com/Anime-Lists/anime-lists/master/animetitles.xml"
        return url
class AnimeTitlesEntry(BaseXmlModel, tag="anime"):
    """One `<anime>` element of the titles document: an id and its titles."""

    # Value of the `aid` XML attribute (used as the AniDB id by get_data()).
    aid: str = attr()
    # Title child elements of this entry; empty list when none are present.
    titles: list[AnimeTitle] = element(default=[])
class AnimeTitle(
    BaseXmlModel,
    tag="title",
    # Declares the `xml` prefix so the `xml:lang` attribute below can be bound.
    nsmap={"xml": "http://www.w3.org/XML/1998/namespace"},
):
    """A single `<title>` element: its type, language and text."""

    # Value of the `type` XML attribute.
    type: str = attr()
    # Value of the namespaced `xml:lang` attribute.
    lang: str = attr(ns="xml")
    # Plain (non attr/element) field: pydantic-xml binds it to the element text.
    text: str
class AnimeListDb(BaseXmlModel, tag="anime-list"):
    """Root `<anime-list>` element of the anime-lists mapping document."""

    # Child entries of the root element; an empty list when none are present.
    animes: list[AnimeEntry] = element(default=[])

    @classmethod
    def get_url(cls):
        """Return the URL the mapping document is downloaded from."""
        url = "https://raw.githubusercontent.com/Anime-Lists/anime-lists/refs/heads/master/anime-list.xml"
        return url
class AnimeEntry(BaseXmlModel, tag="anime"):
    """One `<anime>` element of anime-list.xml.

    Maps an AniDB id to the ids used by other providers, plus optional
    per-episode mapping rules.
    """

    # XML attributes of the `<anime>` element.
    anidbid: str = attr()
    tvdbid: str | None = attr(default=None)
    # Either a season number or the literal "a".
    defaulttvdbseason: int | Literal["a"] | None = attr(default=None)
    # NOTE(review): an empty `episodeoffset=""` attribute would fail `int`
    # validation -- confirm the upstream file never contains one.
    episodeoffset: int = attr(default=0)
    tmdbid: str | None = attr(default=None)
    imdbid: str | None = attr(default=None)

    # Child elements.
    name: str | None = element(default=None)
    # Defaults to None (not a list) so the default matches the declared type
    # and callers can rely on `mapping_list is None` when the element is absent.
    mapping_list: MappingList | None = element(default=None)

    @field_validator("tmdbid", "imdbid")
    @classmethod
    def _empty_to_none(cls, v: str | None) -> str | None:
        """Normalize empty attribute values (e.g. `tmdbid=""`) to None."""
        return v or None
class MappingList(BaseXmlModel, tag="mapping-list"):
    """The `<mapping-list>` element: a container of episode mappings."""

    # Episode mapping child elements; empty list when none are present.
    mappings: list[EpisodeMapping] = element(default=[])
# The explicit tag is required: without it pydantic-xml falls back to the
# referencing field name (`mappings`) and would never match `<mapping>`.
class EpisodeMapping(BaseXmlModel, tag="mapping"):
    """One `<mapping>` element describing how episodes of an AniDB season map
    to a TVDB season.

    The element text, when present, holds explicit per-episode pairs in the
    form `;<anidb>-<tvdb>[+<tvdb>...];...`.
    """

    # XML attributes of the `<mapping>` element.
    anidbseason: int = attr()
    tvdbseason: int | None = attr(default=None)
    start: int | None = attr(default=None)
    end: int | None = attr(default=None)
    offset: int = attr(default=0)
    # Raw element text (the `;a-b;c-d+e;` pair list), if any.
    text: str | None = None

    @cached_property
    def tvdb_mappings(self) -> dict[int, list[int]]:
        """Parse the element text into `{anidb_episode: [tvdb_episode, ...]}`.

        Returns an empty dict when there is no TVDB season or no text.
        Malformed pairs (no `-`, or non-integer parts) are skipped.
        """
        if self.tvdbseason is None or not self.text:
            return {}

        parsed: dict[int, list[int]] = {}
        for pair in self.text.split(";"):
            # Split on the first "-" only; `sep` is empty when there is none
            # (this also covers the empty chunks around leading/trailing ";").
            anidb_ep, sep, tvdb_eps = pair.strip().partition("-")
            if not sep:
                continue
            try:
                # int() tolerates surrounding whitespace by itself.
                parsed[int(anidb_ep)] = [int(x) for x in tvdb_eps.split("+")]
            except ValueError:
                continue
        return parsed
@dataclass
class AnimeListData:
    """In-memory snapshot of the two anime-lists databases.

    The dict fields use `default_factory`: a bare `= {}` default makes
    `@dataclass` raise `ValueError` (mutable default) at class creation,
    and a factory also guarantees instances never share the same dict.
    """

    # When the snapshot was built.
    fetched_at: datetime
    # normalized title -> anidbid
    titles: dict[str, str] = field(default_factory=dict)
    # anidbid -> AnimeEntry (quoted: forward reference, never evaluated here)
    animes: dict[str, "AnimeListDb.AnimeEntry"] = field(default_factory=dict)
@cache(ttl=timedelta(days=30))
async def get_data() -> AnimeListData:
    """Download and index both anime-lists XML documents.

    Builds a normalized-title -> AniDB id index from the titles document and
    an AniDB id -> entry index from the mapping document. The result is
    wrapped by `cache` with a 30 day TTL.

    Raises whatever `raise_for_status()` raises on a non-success response.
    """
    logger.info("Fetching anime-lists XML databases...")
    snapshot = AnimeListData(fetched_at=datetime.now())

    async with ClientSession() as http:
        async with http.get(AnimeTitlesDb.get_url()) as titles_resp:
            titles_resp.raise_for_status()
            titles_db = AnimeTitlesDb.from_xml(await titles_resp.read())

        # Later titles overwrite earlier ones that normalize to the same key.
        title_index = {}
        for anime in titles_db.animes:
            for title in anime.titles:
                title_index[normalize_title(title.text)] = anime.aid
        snapshot.titles = title_index

        async with http.get(AnimeListDb.get_url()) as list_resp:
            list_resp.raise_for_status()
            list_db = AnimeListDb.from_xml(await list_resp.read())

        entry_index = {}
        for entry in list_db.animes:
            entry_index[entry.anidbid] = entry
        snapshot.animes = entry_index

    logger.info(
        "Loaded %d anime titles from animelist-xml.",
        len(snapshot.titles),
    )
    return snapshot
def normalize_title(title: str) -> str:
    """Reduce a title to a comparison key.

    Steps, in order: NFD-decompose and drop combining marks (strips accents),
    lowercase, remove every character that is neither a word character nor
    whitespace, then collapse whitespace runs to a single space and trim.
    """
    decomposed = unicodedata.normalize("NFD", title)
    unaccented = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    no_punctuation = re.sub(r"[^\w\s]", "", unaccented.lower())
    return re.sub(r"\s+", " ", no_punctuation).strip()
async def anilist(_path: str, guess: Guess) -> Guess:
    """Enrich a guess with ids from the anime-lists databases.

    Looks the guessed title up in the title index and, on a match, returns a
    new `Guess` carrying the AniDB id plus any TVDB/TMDB/IMDB ids listed for
    that entry. The input guess is appended to the new guess' history.

    Args:
        _path: Path of the scanned file (unused).
        guess: The guess produced by the previous pipeline step.

    Returns:
        The original `guess` untouched when no title or entry matches,
        otherwise a new `Guess` with extended `external_id`.
    """
    data = await get_data()

    # The index is keyed by normalize_title(...) in get_data(); the lookup
    # must be normalized the same way or almost nothing would ever match.
    aid = data.titles.get(normalize_title(guess.title))
    if aid is None:
        return guess

    anime = data.animes.get(aid)
    if anime is None:
        logger.warning("AniDB id %s found in titles but not in anime-list.xml", aid)
        return guess

    logger.info(
        "Matched '%s' to AniDB id %s (tvdb=%s, tmdbid=%s)",
        guess.title,
        aid,
        anime.tvdbid,
        anime.tmdbid,
    )

    # Copy so the input guess (kept in history) is not mutated.
    new_external_id = dict(guess.external_id)
    new_external_id[ProviderName.ANIDB] = aid
    # anime-list.xml uses non-numeric markers (e.g. "movie") for entries
    # without a TVDB series; those are not usable as TVDB ids.
    if anime.tvdbid and anime.tvdbid.isdigit():
        new_external_id[ProviderName.TVDB] = anime.tvdbid
    if anime.tmdbid:
        new_external_id[ProviderName.TMDB] = anime.tmdbid
    if anime.imdbid:
        new_external_id[ProviderName.IMDB] = anime.imdbid

    # TODO: remap episodes through the entry's mapping rules.
    # Until that exists, carry the guessed episodes over unchanged instead of
    # silently dropping them from the returned guess.
    new_episodes: list[Guess.Episode] = list(guess.episodes)

    return Guess(
        title=guess.title,
        kind=guess.kind,
        extra_kind=guess.extra_kind,
        years=guess.years,
        episodes=new_episodes,
        external_id=new_external_id,
        raw=guess.raw,
        from_="anilist",
        history=[*guess.history, guess],
    )

View File

@ -7,14 +7,15 @@ from typing import Callable, Literal, cast
from rebulk.match import Match
from ..models.videos import Guess, Video
from .anilist import anilist
from .guess.guess import guessit
logger = getLogger(__name__)
pipeline: list[Callable[[str, Guess], Awaitable[Guess]]] = [
anilist,
# TODO: add nfo scanner
# TODO: add thexem
# TODO: add anilist
]

View File

@ -33,6 +33,7 @@ class Guess(Model, extra="allow"):
class Episode(Model):
season: int | None
episode: int
external_id: dict[str, MetadataId | EpisodeId] = {}
@override
def __hash__(self) -> int:

View File

@ -2,3 +2,4 @@ class ProviderName:
TMDB = "themoviedatabase"
TVDB = "tvdb"
IMDB = "imdb"
ANIDB = "anidb"

15
scanner/uv.lock generated
View File

@ -1259,6 +1259,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" },
]
[[package]]
name = "pydantic-xml"
version = "2.19.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
{ name = "pydantic-core" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b8/cb/5f80b61d73a8d6171ee4611bfd2b944c036c6f6e5f6e01d9fb02f29d7bfc/pydantic_xml-2.19.0.tar.gz", hash = "sha256:b7acba5a0966cbbbc9bf88d0d870b2bc875da063fe1bbe62d83939b549224730", size = 26228, upload-time = "2026-02-14T17:33:53.368Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/27/2d/dce0dc471fade04829c2948462d79c9bc4991305b0f73889f70c9645e540/pydantic_xml-2.19.0-py3-none-any.whl", hash = "sha256:42854bf962758bec338c112c2de984723708262793e108416f33aa4d6c11b3b4", size = 42536, upload-time = "2026-02-14T17:33:54.206Z" },
]
[[package]]
name = "pygments"
version = "2.19.2"
@ -1482,6 +1495,7 @@ dependencies = [
{ name = "opentelemetry-instrumentation-fastapi" },
{ name = "opentelemetry-sdk" },
{ name = "pydantic" },
{ name = "pydantic-xml" },
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-slugify" },
{ name = "watchfiles" },
@ -1502,6 +1516,7 @@ requires-dist = [
{ name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.59b0" },
{ name = "opentelemetry-sdk", specifier = ">=1.38.0" },
{ name = "pydantic", specifier = ">=2.11.4" },
{ name = "pydantic-xml", specifier = ">=2.14.0" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.10.1" },
{ name = "python-slugify", specifier = ">=8.0.4" },
{ name = "watchfiles", specifier = ">=1.0.5" },