# SPDX-License-Identifier: AGPL-3.0-or-later
"""The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific
paper results using the :ref:`result_types.paper` class.  It is an "online" JSON
engine that uses the official public API and does not require an API key.

.. _OpenAlex: https://openalex.org
.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview

Key features
============

- Uses the official Works endpoint (JSON)
- Paging support via ``page`` and ``per-page``
- Relevance sorting (``sort=relevance_score:desc``)
- Language filter support (maps SearXNG language to ``filter=language:<iso2>``)
- Maps fields commonly used in scholarly results: title, authors, abstract
  (reconstructed from inverted index), journal/venue, publisher, DOI, tags
  (concepts), PDF/HTML links, pages, volume, issue, published date, and a short
  citations comment
- Supports OpenAlex "polite pool" by adding a ``mailto`` parameter
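
For example, a second-page query for ``graphene`` with the UI language set to
French is turned into a request like the following (the exact string is built
by ``request`` below; ``:`` is percent-encoded by ``urlencode``):

.. code:: text

   https://api.openalex.org/works?search=graphene&page=2&per-page=10&sort=relevance_score%3Adesc&filter=language%3Afr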

Configuration
=============

Minimal example for :origin:`settings.yml <searx/settings.yml>`:

.. code:: yaml

   - name: openalex
     engine: openalex
     shortcut: oa
     categories: science, scientific publications
     timeout: 5.0
     # Recommended by OpenAlex: join the polite pool with an email address
     mailto: "you@example.org"

Notes
-----

- The ``mailto`` key is optional but recommended by OpenAlex for better service.
- Language is inherited from the user's UI language; when it is not ``all``, the
  engine adds ``filter=language:<iso2>`` (e.g. ``language:fr``). If OpenAlex has
  few results for that language, you may see fewer items.
- Results typically include a main link. When the primary landing page from
  OpenAlex is a DOI resolver, the engine will use that stable link. When an open
  access link is available, it is exposed via the ``PDF`` and/or ``HTML`` links
  in the result footer.
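
The language normalization is a simple locale split, mirroring the logic in
``request`` below; anything that does not reduce to two letters is ignored:

.. code:: python

   language = "fr-FR"  # as set from the UI locale
   iso2 = language.split("-")[0].split("_")[0]  # -> "fr"; "pt_BR" -> "pt"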

What is returned
================

Each result uses the :ref:`result_types.paper` class and may include:

- ``title`` and ``content`` (abstract; reconstructed from the inverted index)
- ``authors`` (display names)
- ``journal`` (host venue display name) and ``publisher``
- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix)
- ``tags`` (OpenAlex concepts display names)
- ``pdf_url`` (open access PDF if available) and ``html_url`` (landing page)
- ``publishedDate`` (parsed from ``publication_date``)
- ``pages``, ``volume``, ``number`` (issue)
- ``type`` and a brief ``comments`` string with the citation count
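
For instance, an item with ``"doi": "https://doi.org/10.1234/abcd"`` and
``"cited_by_count": 42`` yields a plain ``doi`` of ``10.1234/abcd`` and a
``comments`` string of ``42 citations`` (see ``_doi_to_plain`` and
``_extract_comments`` below; the DOI value is illustrative).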

Rate limits & polite pool
=========================

OpenAlex offers a free public API with generous daily limits. For extra courtesy
and improved service quality, include a contact email in each request via
``mailto``. You can set it directly in the engine configuration as shown above.
See: `OpenAlex API overview`_.
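
When configured, the address is simply appended to the query string, e.g. (other
parameters omitted for brevity; ``@`` is percent-encoded by ``urlencode``):

.. code:: text

   https://api.openalex.org/works?search=graphene&mailto=you%40example.org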

Troubleshooting
===============

- Few or no results in a non-English UI language:
  Ensure the selected language has sufficient coverage at OpenAlex, or set the
  UI language to English and retry.
- Preference changes fail while testing locally:
  Make sure your ``server.secret_key`` and ``server.base_url`` are set in your
  instance settings so signed cookies work; see :ref:`settings server`.


Implementation
==============

"""

import typing as t

from datetime import datetime
from urllib.parse import urlencode

from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams

# about
about = {
    "website": "https://openalex.org/",
    "wikidata_id": "Q110718454",
    "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}


# engine dependent config
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.openalex.org/works"

# Optional: include your email for the OpenAlex polite pool. Can be set from
# settings.yml, e.g.: - name: openalex; engine: openalex; mailto: "you@example.org"
mailto = ""


def request(query: str, params: "OnlineParams") -> None:
    # Build OpenAlex query using search parameter and paging
    args = {
        "search": query,
        "page": params["pageno"],
        # keep result size moderate; OpenAlex default is 25
        "per-page": 10,
        # relevance sorting works only with `search`
        "sort": "relevance_score:desc",
    }

    # Language filter (expects ISO 639-1 like 'fr', 'en')
    language = params.get("language")
    filters: list[str] = []
    if isinstance(language, str) and language != "all":
        iso2 = language.split("-")[0].split("_")[0]
        if len(iso2) == 2:
            filters.append(f"language:{iso2}")

    if filters:
        args["filter"] = ",".join(filters)

    # include mailto if configured for polite pool (engine module setting)
    if isinstance(mailto, str) and mailto != "":
        args["mailto"] = mailto

    params["url"] = f"{search_url}?{urlencode(args)}"


def response(resp: "SXNG_Response") -> EngineResults:
    data = resp.json()
    res = EngineResults()

    for item in data.get("results", []):
        url, html_url, pdf_url = _extract_links(item)
        title: str = item.get("title", "")
        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
        authors = _extract_authors(item)
        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
        doi = _doi_to_plain(item.get("doi"))
        tags = _extract_tags(item)
        comments = _extract_comments(item)

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                publisher=publisher,
                doi=doi,
                tags=tags,
                authors=authors,
                pdf_url=pdf_url,
                html_url=html_url,
                publishedDate=published_date,
                pages=pages,
                volume=volume,
                number=number,
                type=item.get("type"),
                comments=comments,
            )
        )

    return res


def _stringify_pages(biblio: dict[str, t.Any]) -> str:
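    # Render a page range, e.g. first_page="12", last_page="19" -> "12-19";
    # a single known bound is returned on its own.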
    first_page = biblio.get("first_page")
    last_page = biblio.get("last_page")
    if first_page and last_page:
        return f"{first_page}-{last_page}"
    if first_page:
        return str(first_page)
    if last_page:
        return str(last_page)
    return ""


def _parse_date(value: str | None) -> datetime | None:
    if not value:
        return None
    # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
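    # e.g. "2021-05" matches "%Y-%m" -> datetime(2021, 5, 1)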
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None


def _doi_to_plain(doi_value: str | None) -> str:
    if not doi_value:
        return ""
    # OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
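    # and is returned here as the plain "10.1234/abcd"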
    return doi_value.removeprefix("https://doi.org/")


def _reconstruct_abstract(
    abstract_inverted_index: dict[str, list[int]] | None,
) -> str | None:
    # The abstract is returned as an inverted index {token: [positions...]}
    # Reconstruct by placing tokens at their positions and joining with spaces.
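    # Example: {"Deep": [0, 3], "learning": [1], "wins": [2]}
    #   -> "Deep learning wins Deep"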
    if not abstract_inverted_index:
        return None
    position_to_token: dict[int, str] = {}
    max_index = -1
    for token, positions in abstract_inverted_index.items():
        for pos in positions:
            position_to_token[pos] = token
            max_index = max(max_index, pos)
    if max_index < 0:
        return None
    ordered_tokens = [position_to_token.get(i, "") for i in range(0, max_index + 1)]
    # collapse multiple empty tokens
    text = " ".join(tok for tok in ordered_tokens if tok != "")
    return text if text != "" else None


def _extract_links(item: dict[str, t.Any]) -> tuple[str, str, str]:
    primary_location: dict[str, str] = item.get("primary_location", {})
    open_access: dict[str, str] = item.get("open_access", {})

    landing_page_url: str = primary_location.get("landing_page_url") or ""
    work_url: str = item.get("id", "")

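    # Prefer the landing page as the main link; fall back to the stable
    # OpenAlex work URL (item["id"]) when no landing page is present.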
    url: str = landing_page_url or work_url
    html_url: str = landing_page_url
    pdf_url: str = primary_location.get("pdf_url") or open_access.get("oa_url") or ""

    return url, html_url, pdf_url


def _extract_authors(item: dict[str, t.Any]) -> list[str]:
    authors: list[str] = []
    for auth in item.get("authorships", []):
        if not auth:
            continue
        author_obj = auth.get("author", {})
        display_name = author_obj.get("display_name")
        if isinstance(display_name, str) and display_name != "":
            authors.append(display_name)
    return authors


def _extract_tags(item: dict[str, t.Any]) -> list[str]:
    tags: list[str] = []
    for c in item.get("concepts", []):
        name = (c or {}).get("display_name")
        if isinstance(name, str) and name != "":
            tags.append(name)
    return tags


def _extract_biblio(
    item: dict[str, t.Any],
) -> tuple[str, str, str, str, str, datetime | None]:
    host_venue: dict[str, str] = item.get("host_venue", {})
    biblio: dict[str, str] = item.get("biblio", {})

    journal: str = host_venue.get("display_name", "")
    publisher: str = host_venue.get("publisher", "")
    pages: str = _stringify_pages(biblio)
    volume = biblio.get("volume", "")
    number = biblio.get("issue", "")
    published_date = _parse_date(item.get("publication_date"))
    return journal, publisher, pages, volume, number, published_date


def _extract_comments(item: dict[str, t.Any]) -> str:
    cited_by_count = item.get("cited_by_count")
    if isinstance(cited_by_count, int):
        return f"{cited_by_count} citations"
    return ""