# SPDX-License-Identifier: AGPL-3.0-or-later
"""The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific
paper results using the :ref:`result_types.paper` class. It is an "online" JSON
engine that uses the official public API and does not require an API key.

.. _OpenAlex: https://openalex.org
.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview

Key features
------------

- Uses the official Works endpoint (JSON)
- Paging support via ``page`` and ``per-page``
- Relevance sorting (``sort=relevance_score:desc``)
- Language filter support (maps the SearXNG language to ``filter=language:<iso2>``)
- Maps fields commonly used in scholarly results: title, authors, abstract
  (reconstructed from the inverted index), journal/venue, publisher, DOI, tags
  (concepts), PDF/HTML links, pages, volume, issue, published date, and a short
  citations comment
- Supports the OpenAlex "polite pool" by adding a ``mailto`` parameter (see the
  example request below)
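
For illustration, a second-page search for ``graphene`` (UI language ``all``)
produces a request URL like:

.. code:: text

   https://api.openalex.org/works?search=graphene&page=2&per-page=10&sort=relevance_score%3Adesc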


Configuration
=============

Minimal example for :origin:`settings.yml <searx/settings.yml>`:

.. code:: yaml

   - name: openalex
     engine: openalex
     shortcut: oa
     categories: science, scientific publications
     timeout: 5.0
     # Recommended by OpenAlex: join the polite pool with an email address
     mailto: "contact@example.org"

Notes
-----

- The ``mailto`` key is optional but recommended by OpenAlex for better service.
- Language is inherited from the user's UI language; when it is not ``all``, the
  engine adds ``filter=language:<iso2>`` (e.g. ``language:fr``). If OpenAlex has
  few results for that language, you may see fewer items.
- Results typically include a main link. When the primary landing page from
  OpenAlex is a DOI resolver, the engine will use that stable link. When an open
  access link is available, it is exposed via the ``PDF`` and/or ``HTML`` links
  in the result footer.


What is returned
================

Each result uses the :ref:`result_types.paper` class and may include:

- ``title`` and ``content`` (abstract; reconstructed from the inverted index)
- ``authors`` (display names)
- ``journal`` (host venue display name) and ``publisher``
- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix)
- ``tags`` (OpenAlex concept display names)
- ``pdf_url`` (open access PDF if available) and ``html_url`` (landing page)
- ``publishedDate`` (parsed from ``publication_date``)
- ``pages``, ``volume``, ``number`` (issue)
- ``type`` and a brief ``comments`` string with the citation count
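
For orientation, a heavily trimmed (and hypothetical) Works record showing the
fields the engine reads:

.. code:: json

   {
     "id": "https://openalex.org/W0000000000",
     "title": "An example paper",
     "doi": "https://doi.org/10.1234/example",
     "publication_date": "2021-05-01",
     "type": "article",
     "cited_by_count": 42,
     "biblio": {"volume": "7", "issue": "2", "first_page": "100", "last_page": "110"},
     "authorships": [{"author": {"display_name": "Jane Doe"}}],
     "concepts": [{"display_name": "Computer science"}],
     "host_venue": {"display_name": "Journal of Examples", "publisher": "Example Press"},
     "primary_location": {"landing_page_url": "https://doi.org/10.1234/example", "pdf_url": null},
     "open_access": {"oa_url": "https://example.org/paper.pdf"},
     "abstract_inverted_index": {"An": [0], "example": [1], "abstract": [2]}
   }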


Rate limits & polite pool
=========================

OpenAlex offers a free public API with generous daily limits. For extra courtesy
and improved service quality, include a contact email in each request via
``mailto``. You can set it directly in the engine configuration as shown above.
See: `OpenAlex API overview`_.
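
When ``mailto`` is configured, the engine simply appends it to the query string
of every request, e.g. (illustrative address):

.. code:: text

   https://api.openalex.org/works?search=graphene&page=1&per-page=10&sort=relevance_score%3Adesc&mailto=contact%40example.org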


Troubleshooting
===============

- Few or no results in a non-English UI language:
  Ensure the selected language has sufficient coverage at OpenAlex, or set the
  UI language to English and retry.
- Preference changes fail while testing locally:
  Make sure ``server.secret_key`` and ``server.base_url`` are set in your
  instance settings so signed cookies work; see :ref:`settings server` and the
  example below.
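
For a local test instance, such a configuration can look like (illustrative
values; see the settings documentation for details):

.. code:: yaml

   server:
     secret_key: "use-a-long-random-value"
     base_url: "http://127.0.0.1:8888/"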


Implementation
==============

"""

import typing as t

from datetime import datetime
from urllib.parse import urlencode
from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams


# about
about = {
    "website": "https://openalex.org/",
    "wikidata_id": "Q110718454",
    "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}


# engine dependent config
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.openalex.org/works"

# Optional: include your email for the OpenAlex polite pool. Can be set from settings.yml:
# engines: - name: openalex; engine: openalex; mailto: "contact@example.org"
mailto = ""


def request(query: str, params: "OnlineParams") -> None:
    # Build OpenAlex query using search parameter and paging
    args = {
        "search": query,
        "page": params["pageno"],
        # keep result size moderate; OpenAlex default is 25
        "per-page": 10,
        # relevance sorting works only with `search`
        "sort": "relevance_score:desc",
    }

    # Language filter (expects ISO639-1 like 'fr', 'en')
    language = params.get("language")
    filters: list[str] = []
    if isinstance(language, str) and language != "all":
        iso2 = language.split("-")[0].split("_")[0]
        if len(iso2) == 2:
            filters.append(f"language:{iso2}")

    if filters:
        args["filter"] = ",".join(filters)

    # include mailto if configured for polite pool (engine module setting)
    if isinstance(mailto, str) and mailto != "":
        args["mailto"] = mailto

    params["url"] = f"{search_url}?{urlencode(args)}"


def response(resp: "SXNG_Response") -> EngineResults:
    data = resp.json()
    res = EngineResults()

    for item in data.get("results", []):
        url, html_url, pdf_url = _extract_links(item)
        # `title` may be null in the Works record
        title: str = item.get("title") or ""
        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
        authors = _extract_authors(item)
        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
        doi = _doi_to_plain(item.get("doi"))
        tags = _extract_tags(item)
        comments = _extract_comments(item)

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                publisher=publisher,
                doi=doi,
                tags=tags,
                authors=authors,
                pdf_url=pdf_url,
                html_url=html_url,
                publishedDate=published_date,
                pages=pages,
                volume=volume,
                number=number,
                type=item.get("type"),
                comments=comments,
            )
        )

    return res


def _stringify_pages(biblio: dict[str, t.Any]) -> str:
    first_page = biblio.get("first_page")
    last_page = biblio.get("last_page")
    if first_page and last_page:
        return f"{first_page}-{last_page}"
    if first_page:
        return str(first_page)
    if last_page:
        return str(last_page)
    return ""


def _parse_date(value: str | None) -> datetime | None:
    if not value:
        return None
    # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None
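
# e.g. _parse_date("2021-05") -> datetime(2021, 5, 1); unparsable input -> None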


def _doi_to_plain(doi_value: str | None) -> str:
    if not doi_value:
        return ""
    # OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
    return doi_value.removeprefix("https://doi.org/")
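
# e.g. _doi_to_plain("https://doi.org/10.1234/example") -> "10.1234/example"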


def _reconstruct_abstract(
    abstract_inverted_index: dict[str, list[int]] | None,
) -> str | None:
    # The abstract is returned as an inverted index {token: [positions...]}.
    # Reconstruct by placing tokens at their positions and joining with spaces.
    if not abstract_inverted_index:
        return None
    position_to_token: dict[int, str] = {}
    max_index = -1
    for token, positions in abstract_inverted_index.items():
        for pos in positions:
            position_to_token[pos] = token
            max_index = max(max_index, pos)
    if max_index < 0:
        return None
    ordered_tokens = [position_to_token.get(i, "") for i in range(max_index + 1)]
    # drop empty slots so missing positions do not produce double spaces
    text = " ".join(tok for tok in ordered_tokens if tok != "")
    return text if text != "" else None
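
# e.g. _reconstruct_abstract({"Deep": [0], "learning": [2], "neural": [1]})
# -> "Deep neural learning"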


def _extract_links(item: dict[str, t.Any]) -> tuple[str, str, str]:
    # primary_location and open_access may be null in the Works JSON
    primary_location: dict[str, t.Any] = item.get("primary_location") or {}
    open_access: dict[str, t.Any] = item.get("open_access") or {}

    landing_page_url: str = primary_location.get("landing_page_url") or ""
    work_url: str = item.get("id", "")

    url: str = landing_page_url or work_url
    html_url: str = landing_page_url
    pdf_url: str = primary_location.get("pdf_url") or open_access.get("oa_url") or ""

    return url, html_url, pdf_url


def _extract_authors(item: dict[str, t.Any]) -> list[str]:
    authors: list[str] = []
    for auth in item.get("authorships") or []:
        if not auth:
            continue
        author_obj = auth.get("author") or {}
        display_name = author_obj.get("display_name")
        if isinstance(display_name, str) and display_name != "":
            authors.append(display_name)
    return authors


def _extract_tags(item: dict[str, t.Any]) -> list[str]:
    tags: list[str] = []
    for c in item.get("concepts") or []:
        name = (c or {}).get("display_name")
        if isinstance(name, str) and name != "":
            tags.append(name)
    return tags


def _extract_biblio(
    item: dict[str, t.Any],
) -> tuple[str, str, str, str, str, datetime | None]:
    host_venue: dict[str, t.Any] = item.get("host_venue") or {}
    biblio: dict[str, t.Any] = item.get("biblio") or {}

    journal: str = host_venue.get("display_name") or ""
    publisher: str = host_venue.get("publisher") or ""
    pages: str = _stringify_pages(biblio)
    volume: str = biblio.get("volume") or ""
    number: str = biblio.get("issue") or ""
    published_date = _parse_date(item.get("publication_date"))
    return journal, publisher, pages, volume, number, published_date


def _extract_comments(item: dict[str, t.Any]) -> str:
    cited_by_count = item.get("cited_by_count")
    if isinstance(cited_by_count, int):
        return f"{cited_by_count} citations"
    return ""