mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-04 03:27:06 -05:00 
			
		
		
		
	[feat] engines: add OpenAlex Works engine (#5102)
- Adds a new engine `searx/engines/openalex.py` that integrates the OpenAlex Works API to return scientific paper results using the `paper.html` template. - Uses the official API (no auth required); supports OpenAlex polite pool via `mailto`. - Adds developer docs at `docs/dev/engines/online/openalex.rst`. OpenAlex API reference: https://docs.openalex.org/how-to-use-the-api/api-overview
This commit is contained in:
		
							parent
							
								
									11ea1a8134
								
							
						
					
					
						commit
						a0ff173799
					
				
							
								
								
									
										100
									
								
								docs/dev/engines/online/openalex.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								docs/dev/engines/online/openalex.rst
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,100 @@
 | 
				
			|||||||
 | 
					.. _openalex engine:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					=========
 | 
				
			||||||
 | 
					OpenAlex
 | 
				
			||||||
 | 
					=========
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Overview
 | 
				
			||||||
 | 
					========
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific paper
 | 
				
			||||||
 | 
					results using the :origin:`paper.html <searx/templates/simple/result_templates/paper.html>`
 | 
				
			||||||
 | 
					template. It is an "online" JSON engine that uses the official public API and does
 | 
				
			||||||
 | 
					not require an API key.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. _OpenAlex: https://openalex.org
 | 
				
			||||||
 | 
					.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Key features
 | 
				
			||||||
 | 
					------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- Uses the official Works endpoint (JSON)
 | 
				
			||||||
 | 
					- Paging support via ``page`` and ``per-page``
 | 
				
			||||||
 | 
					- Relevance sorting (``sort=relevance_score:desc``)
 | 
				
			||||||
 | 
					- Language filter support (maps SearXNG language to ``filter=language:<iso2>``)
 | 
				
			||||||
 | 
					- Maps fields commonly used in scholarly results: title, authors, abstract
 | 
				
			||||||
 | 
					  (reconstructed from inverted index), journal/venue, publisher, DOI, tags
 | 
				
			||||||
 | 
					  (concepts), PDF/HTML links, pages, volume, issue, published date, and a short
 | 
				
			||||||
 | 
					  citations comment
 | 
				
			||||||
 | 
					- Supports OpenAlex "polite pool" by adding a ``mailto`` parameter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Configuration
 | 
				
			||||||
 | 
					=============
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Minimal example for :origin:`settings.yml <searx/settings.yml>`:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. code:: yaml
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   - name: openalex
 | 
				
			||||||
 | 
					     engine: openalex
 | 
				
			||||||
 | 
					     shortcut: oa
 | 
				
			||||||
 | 
					     categories: science, scientific publications
 | 
				
			||||||
 | 
					     timeout: 5.0
 | 
				
			||||||
 | 
					     # Recommended by OpenAlex: join the polite pool with an email address
 | 
				
			||||||
 | 
					     mailto: "you@example.com"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Notes
 | 
				
			||||||
 | 
					-----
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- The ``mailto`` key is optional but recommended by OpenAlex for better service.
 | 
				
			||||||
 | 
					- Language is inherited from the user's UI language; when it is not ``all``, the
 | 
				
			||||||
 | 
					  engine adds ``filter=language:<iso2>`` (e.g. ``language:fr``). If OpenAlex has
 | 
				
			||||||
 | 
					  few results for that language, you may see fewer items.
 | 
				
			||||||
 | 
					- Results typically include a main link. When the primary landing page from
 | 
				
			||||||
 | 
					  OpenAlex is a DOI resolver, the engine will use that stable link. When an open
 | 
				
			||||||
 | 
					  access link is available, it is exposed via the ``PDF`` and/or ``HTML`` links
 | 
				
			||||||
 | 
					  in the result footer.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					What is returned
 | 
				
			||||||
 | 
					================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Each result uses the ``paper.html`` template and may include:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- ``title`` and ``content`` (abstract; reconstructed from the inverted index)
 | 
				
			||||||
 | 
					- ``authors`` (display names)
 | 
				
			||||||
 | 
					- ``journal`` (host venue display name) and ``publisher``
 | 
				
			||||||
 | 
					- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix)
 | 
				
			||||||
 | 
					- ``tags`` (OpenAlex concepts display names)
 | 
				
			||||||
 | 
					- ``pdf_url`` (Open access PDF if available) and ``html_url`` (landing page)
 | 
				
			||||||
 | 
					- ``publishedDate`` (parsed from ``publication_date``)
 | 
				
			||||||
 | 
					- ``pages``, ``volume``, ``number`` (issue)
 | 
				
			||||||
 | 
					- ``type`` and a brief ``comments`` string with citation count
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Rate limits & polite pool
 | 
				
			||||||
 | 
					=========================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					OpenAlex offers a free public API with generous daily limits. For extra courtesy
 | 
				
			||||||
 | 
					and improved service quality, include a contact email in each request via
 | 
				
			||||||
 | 
					``mailto``. You can set it directly in the engine configuration as shown above.
 | 
				
			||||||
 | 
					See: `OpenAlex API overview`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Troubleshooting
 | 
				
			||||||
 | 
					===============
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- Few or no results with a non-English search language:
 | 
				
			||||||
 | 
					  Ensure the selected language has sufficient coverage at OpenAlex, or set the
 | 
				
			||||||
 | 
					  search language to English (or ``all``) and retry.
 | 
				
			||||||
 | 
					- Preference changes fail while testing locally:
 | 
				
			||||||
 | 
					  Make sure your ``server.secret_key`` and ``server.base_url`` are set in your
 | 
				
			||||||
 | 
					  instance settings so signed cookies work; see :ref:`settings server`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Implementation
 | 
				
			||||||
 | 
					===============
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. automodule:: searx.engines.openalex
 | 
				
			||||||
 | 
					   :members:
 | 
				
			||||||
							
								
								
									
										205
									
								
								searx/engines/openalex.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										205
									
								
								searx/engines/openalex.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,205 @@
 | 
				
			|||||||
 | 
					# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
				
			||||||
 | 
					# pylint: disable=missing-module-docstring
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Engine is documented in: docs/dev/engines/online/openalex.rst
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import typing as t
 | 
				
			||||||
 | 
					from datetime import datetime
 | 
				
			||||||
 | 
					from urllib.parse import urlencode
 | 
				
			||||||
 | 
					from searx.result_types import EngineResults
 | 
				
			||||||
 | 
					from searx.extended_types import SXNG_Response
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# about
 | 
				
			||||||
 | 
					about = {
 | 
				
			||||||
 | 
					    "website": "https://openalex.org/",
 | 
				
			||||||
 | 
					    "wikidata_id": "Q110718454",
 | 
				
			||||||
 | 
					    "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
 | 
				
			||||||
 | 
					    "use_official_api": True,
 | 
				
			||||||
 | 
					    "require_api_key": False,
 | 
				
			||||||
 | 
					    "results": "JSON",
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# engine dependent config
 | 
				
			||||||
 | 
					categories = ["science", "scientific publications"]
 | 
				
			||||||
 | 
					paging = True
 | 
				
			||||||
 | 
					search_url = "https://api.openalex.org/works"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Optional: include your email for OpenAlex polite pool. Can be set from settings.yml
 | 
				
			||||||
 | 
					# engines: - name: openalex; engine: openalex; mailto: "you@example.com"
 | 
				
			||||||
 | 
					mailto = ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def request(query: str, params: dict[str, t.Any]) -> None:
					    """Build the OpenAlex Works request URL and store it in ``params["url"]``.

					    The URL always carries the search term, the page number, a page size of
					    10 and relevance sorting.  A ``filter=language:<iso2>`` argument is added
					    when ``params["language"]`` reduces to a two-letter code, and ``mailto``
					    is added when the module-level ``mailto`` setting is a non-empty string.
					    """
					    query_args: dict[str, t.Any] = {
					        "search": query,
					        "page": params["pageno"],
					        # keep the result size moderate (the OpenAlex default is 25)
					        "per-page": 10,
					        # relevance sorting is only meaningful together with `search`
					        "sort": "relevance_score:desc",
					    }
					    # Reduce a locale such as 'fr-FR' or 'fr_FR' to its leading language code.
					    lang = params.get("language")
					    if isinstance(lang, str) and lang != "all":
					        code = lang.partition("-")[0].partition("_")[0]
					        if len(code) == 2:
					            query_args["filter"] = f"language:{code}"
					    # Polite pool: only sent when an address is configured for the engine.
					    if isinstance(mailto, str) and mailto:
					        query_args["mailto"] = mailto
					    params["url"] = search_url + "?" + urlencode(query_args)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def response(resp: SXNG_Response) -> EngineResults:
					    """Convert an OpenAlex Works JSON response into ``paper.html`` results.

					    Every entry of the payload's ``results`` list becomes one legacy result
					    whose fields are filled by the ``_extract_*`` helpers of this module.

					    :param resp: HTTP response whose body is the OpenAlex JSON document.
					    :returns: the collected results (empty when ``results`` is missing or null).
					    """
					    data = resp.json()
					    res = EngineResults()
					    # `.get(key, default)` only applies the default when the key is absent; a
					    # JSON null would come back as None, so normalise with `or` instead.
					    for item in data.get("results") or []:
					        url, html_url, pdf_url = _extract_links(item)
					        title: str = item.get("title") or ""
					        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
					        authors = _extract_authors(item)
					        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
					        doi = _doi_to_plain(item.get("doi"))
					        # an empty tag list is passed on as None
					        tags = _extract_tags(item) or None
					        comments = _extract_comments(item)
					        res.add(
					            res.types.LegacyResult(
					                template="paper.html",
					                url=url,
					                title=title,
					                content=content,
					                journal=journal,
					                publisher=publisher,
					                doi=doi,
					                tags=tags,
					                authors=authors,
					                pdf_url=pdf_url,
					                html_url=html_url,
					                publishedDate=published_date,
					                pages=pages,
					                volume=volume,
					                number=number,
					                type=item.get("type"),
					                comments=comments,
					            )
					        )
					    return res
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _stringify_pages(biblio: dict[str, t.Any]) -> str | None:
 | 
				
			||||||
 | 
					    first_page = biblio.get("first_page")
 | 
				
			||||||
 | 
					    last_page = biblio.get("last_page")
 | 
				
			||||||
 | 
					    if first_page and last_page:
 | 
				
			||||||
 | 
					        return f"{first_page}-{last_page}"
 | 
				
			||||||
 | 
					    if first_page:
 | 
				
			||||||
 | 
					        return str(first_page)
 | 
				
			||||||
 | 
					    if last_page:
 | 
				
			||||||
 | 
					        return str(last_page)
 | 
				
			||||||
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _parse_date(value: str | None) -> datetime | None:
 | 
				
			||||||
 | 
					    if not value:
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					    # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
 | 
				
			||||||
 | 
					    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            return datetime.strptime(value, fmt)
 | 
				
			||||||
 | 
					        except ValueError:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _doi_to_plain(doi_value: str | None) -> str | None:
 | 
				
			||||||
 | 
					    if not doi_value:
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					    # OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
 | 
				
			||||||
 | 
					    return doi_value.removeprefix("https://doi.org/")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _reconstruct_abstract(
 | 
				
			||||||
 | 
					    abstract_inverted_index: dict[str, list[int]] | None,
 | 
				
			||||||
 | 
					) -> str | None:
 | 
				
			||||||
 | 
					    # The abstract is returned as an inverted index {token: [positions...]}
 | 
				
			||||||
 | 
					    # Reconstruct by placing tokens at their positions and joining with spaces.
 | 
				
			||||||
 | 
					    if not abstract_inverted_index:
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					    position_to_token: dict[int, str] = {}
 | 
				
			||||||
 | 
					    max_index = -1
 | 
				
			||||||
 | 
					    for token, positions in abstract_inverted_index.items():
 | 
				
			||||||
 | 
					        for pos in positions:
 | 
				
			||||||
 | 
					            position_to_token[pos] = token
 | 
				
			||||||
 | 
					            max_index = max(max_index, pos)
 | 
				
			||||||
 | 
					    if max_index < 0:
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					    ordered_tokens = [position_to_token.get(i, "") for i in range(0, max_index + 1)]
 | 
				
			||||||
 | 
					    # collapse multiple empty tokens
 | 
				
			||||||
 | 
					    text = " ".join(t for t in ordered_tokens if t != "")
 | 
				
			||||||
 | 
					    return text if text != "" else None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_links(item: dict[str, t.Any]) -> tuple[str, str | None, str | None]:
 | 
				
			||||||
 | 
					    primary_location = item.get("primary_location", {})
 | 
				
			||||||
 | 
					    landing_page_url: str | None = primary_location.get("landing_page_url")
 | 
				
			||||||
 | 
					    work_url: str = item.get("id", "")
 | 
				
			||||||
 | 
					    url: str = landing_page_url or work_url
 | 
				
			||||||
 | 
					    open_access = item.get("open_access", {})
 | 
				
			||||||
 | 
					    pdf_url: str | None = primary_location.get("pdf_url") or open_access.get("oa_url")
 | 
				
			||||||
 | 
					    html_url: str | None = landing_page_url
 | 
				
			||||||
 | 
					    return url, html_url, pdf_url
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_authors(item: dict[str, t.Any]) -> list[str]:
 | 
				
			||||||
 | 
					    authors: list[str] = []
 | 
				
			||||||
 | 
					    for auth in item.get("authorships", []):
 | 
				
			||||||
 | 
					        if not auth:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        author_obj = auth.get("author", {})
 | 
				
			||||||
 | 
					        display_name = author_obj.get("display_name")
 | 
				
			||||||
 | 
					        if isinstance(display_name, str) and display_name != "":
 | 
				
			||||||
 | 
					            authors.append(display_name)
 | 
				
			||||||
 | 
					    return authors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_tags(item: dict[str, t.Any]) -> list[str]:
 | 
				
			||||||
 | 
					    tags: list[str] = []
 | 
				
			||||||
 | 
					    for c in item.get("concepts", []):
 | 
				
			||||||
 | 
					        name = (c or {}).get("display_name")
 | 
				
			||||||
 | 
					        if isinstance(name, str) and name != "":
 | 
				
			||||||
 | 
					            tags.append(name)
 | 
				
			||||||
 | 
					    return tags
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_biblio(
					    item: dict[str, t.Any],
					) -> tuple[str | None, str | None, str | None, str | None, str | None, datetime | None]:
					    """Return ``(journal, publisher, pages, volume, number, published_date)``.

					    Journal and publisher are read from ``host_venue`` when present and
					    otherwise from ``primary_location.source`` (``display_name`` and
					    ``host_organization_name``).  Pages, volume and issue come from
					    ``biblio``; the date is parsed from ``publication_date``.
					    """
					    # Any of these objects may be missing or a JSON null.
					    host_venue = item.get("host_venue") or {}
					    biblio = item.get("biblio") or {}
					    # OpenAlex deprecated `host_venue` in favour of `primary_location.source`;
					    # keep the old key first so payloads that still carry it are unaffected.
					    source = (item.get("primary_location") or {}).get("source") or {}
					    journal: str | None = host_venue.get("display_name") or source.get("display_name")
					    publisher: str | None = host_venue.get("publisher") or source.get("host_organization_name")
					    pages = _stringify_pages(biblio)
					    volume = biblio.get("volume")
					    number = biblio.get("issue")
					    published_date = _parse_date(item.get("publication_date"))
					    return journal, publisher, pages, volume, number, published_date
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_comments(item: dict[str, t.Any]) -> str | None:
 | 
				
			||||||
 | 
					    cited_by_count = item.get("cited_by_count")
 | 
				
			||||||
 | 
					    if isinstance(cited_by_count, int):
 | 
				
			||||||
 | 
					        return f"{cited_by_count} citations"
 | 
				
			||||||
 | 
					    return None
 | 
				
			||||||
@ -1495,6 +1495,15 @@ engines:
 | 
				
			|||||||
      require_api_key: false
 | 
					      require_api_key: false
 | 
				
			||||||
      results: JSON
 | 
					      results: JSON
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  - name: openalex
 | 
				
			||||||
 | 
					    engine: openalex
 | 
				
			||||||
 | 
					    shortcut: oa
 | 
				
			||||||
 | 
					    # https://docs.searxng.org/dev/engines/online/openalex.html
 | 
				
			||||||
 | 
					    # Recommended by OpenAlex: join the polite pool with an email address
 | 
				
			||||||
 | 
					    # mailto: "you@example.com"
 | 
				
			||||||
 | 
					    timeout: 5.0
 | 
				
			||||||
 | 
					    disabled: true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  - name: openclipart
 | 
					  - name: openclipart
 | 
				
			||||||
    engine: openclipart
 | 
					    engine: openclipart
 | 
				
			||||||
    shortcut: ocl
 | 
					    shortcut: ocl
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user