searxng/searx/engines/openalex.py
muthukumaran R a0ff173799
[feat] engines: add OpenAlex Works engine (#5102)
- Adds a new engine `searx/engines/openalex.py` that integrates the OpenAlex
  Works API to return scientific paper results using the `paper.html` template.
- Uses the official API (no auth required); supports OpenAlex polite pool via `mailto`.
- Adds developer docs at `docs/dev/engines/online/openalex.rst`.

OpenAlex API reference: https://docs.openalex.org/how-to-use-the-api/api-overview
2025-08-24 14:17:30 +02:00

206 lines
6.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
#
# Engine is documented in: docs/dev/engines/online/openalex.rst
from __future__ import annotations
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
# about
about = {
"website": "https://openalex.org/",
"wikidata_id": "Q110718454",
"official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
# engine dependent config
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.openalex.org/works"
# Optional: include your email for OpenAlex polite pool. Can be set from settings.yml
# engines: - name: openalex; engine: openalex; mailto: "[email protected]"
mailto = ""
def request(query: str, params: dict[str, t.Any]) -> None:
# Build OpenAlex query using search parameter and paging
args = {
"search": query,
"page": params["pageno"],
# keep result size moderate; OpenAlex default is 25
"per-page": 10,
# relevance sorting works only with `search`
"sort": "relevance_score:desc",
}
# Language filter (expects ISO639-1 like 'fr', 'en')
language = params.get("language")
filters: list[str] = []
if isinstance(language, str) and language != "all":
iso2 = language.split("-")[0].split("_")[0]
if len(iso2) == 2:
filters.append(f"language:{iso2}")
if filters:
args["filter"] = ",".join(filters)
# include mailto if configured for polite pool (engine module setting)
if isinstance(mailto, str) and mailto != "":
args["mailto"] = mailto
params["url"] = f"{search_url}?{urlencode(args)}"
def response(resp: SXNG_Response) -> EngineResults:
data = resp.json()
res = EngineResults()
for item in data.get("results", []):
url, html_url, pdf_url = _extract_links(item)
title: str = item.get("title", "")
content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
authors = _extract_authors(item)
journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
doi = _doi_to_plain(item.get("doi"))
tags = _extract_tags(item) or None
comments = _extract_comments(item)
res.add(
res.types.LegacyResult(
template="paper.html",
url=url,
title=title,
content=content,
journal=journal,
publisher=publisher,
doi=doi,
tags=tags,
authors=authors,
pdf_url=pdf_url,
html_url=html_url,
publishedDate=published_date,
pages=pages,
volume=volume,
number=number,
type=item.get("type"),
comments=comments,
)
)
return res
def _stringify_pages(biblio: dict[str, t.Any]) -> str | None:
first_page = biblio.get("first_page")
last_page = biblio.get("last_page")
if first_page and last_page:
return f"{first_page}-{last_page}"
if first_page:
return str(first_page)
if last_page:
return str(last_page)
return None
def _parse_date(value: str | None) -> datetime | None:
if not value:
return None
# OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
try:
return datetime.strptime(value, fmt)
except ValueError:
continue
return None
def _doi_to_plain(doi_value: str | None) -> str | None:
if not doi_value:
return None
# OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
return doi_value.removeprefix("https://doi.org/")
def _reconstruct_abstract(
abstract_inverted_index: dict[str, list[int]] | None,
) -> str | None:
# The abstract is returned as an inverted index {token: [positions...]}
# Reconstruct by placing tokens at their positions and joining with spaces.
if not abstract_inverted_index:
return None
position_to_token: dict[int, str] = {}
max_index = -1
for token, positions in abstract_inverted_index.items():
for pos in positions:
position_to_token[pos] = token
max_index = max(max_index, pos)
if max_index < 0:
return None
ordered_tokens = [position_to_token.get(i, "") for i in range(0, max_index + 1)]
# collapse multiple empty tokens
text = " ".join(t for t in ordered_tokens if t != "")
return text if text != "" else None
def _extract_links(item: dict[str, t.Any]) -> tuple[str, str | None, str | None]:
primary_location = item.get("primary_location", {})
landing_page_url: str | None = primary_location.get("landing_page_url")
work_url: str = item.get("id", "")
url: str = landing_page_url or work_url
open_access = item.get("open_access", {})
pdf_url: str | None = primary_location.get("pdf_url") or open_access.get("oa_url")
html_url: str | None = landing_page_url
return url, html_url, pdf_url
def _extract_authors(item: dict[str, t.Any]) -> list[str]:
authors: list[str] = []
for auth in item.get("authorships", []):
if not auth:
continue
author_obj = auth.get("author", {})
display_name = author_obj.get("display_name")
if isinstance(display_name, str) and display_name != "":
authors.append(display_name)
return authors
def _extract_tags(item: dict[str, t.Any]) -> list[str]:
tags: list[str] = []
for c in item.get("concepts", []):
name = (c or {}).get("display_name")
if isinstance(name, str) and name != "":
tags.append(name)
return tags
def _extract_biblio(
item: dict[str, t.Any],
) -> tuple[str | None, str | None, str | None, str | None, str | None, datetime | None]:
host_venue = item.get("host_venue", {})
biblio = item.get("biblio", {})
journal: str | None = host_venue.get("display_name")
publisher: str | None = host_venue.get("publisher")
pages = _stringify_pages(biblio)
volume = biblio.get("volume")
number = biblio.get("issue")
published_date = _parse_date(item.get("publication_date"))
return journal, publisher, pages, volume, number, published_date
def _extract_comments(item: dict[str, t.Any]) -> str | None:
cited_by_count = item.get("cited_by_count")
if isinstance(cited_by_count, int):
return f"{cited_by_count} citations"
return None