From 3ec6d65f9b74c36cd9a7b692b0075557550e870f Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Wed, 10 Sep 2025 16:34:30 +0200
Subject: [PATCH] [mod] CORE engine: revision of the engine (Paper result)

Revision of the engine / use of the result type Paper as well as other
typifications.

Signed-off-by: Markus Heiser
---
Notes:

- The result URL prefers the DOI, then the CORE work ID, then the download /
  full-text URL; results offering none of them are skipped.
- The publication date is parsed from ``publishedDate`` or, as a fallback,
  from ``depositedDate``.
- ``publisher``, ``contributors`` and ``authors`` tolerate ``null`` values in
  the JSON response, like ``fullText`` and ``documentType`` already do.

 searx/engines/core.py | 150 ++++++++++++++++++++++--------------------
 searx/settings.yml    |  12 ++--
 2 files changed, 85 insertions(+), 77 deletions(-)

diff --git a/searx/engines/core.py b/searx/engines/core.py
index 489b6252b..0da931792 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -5,6 +5,10 @@ research from repositories and journals.
 
 .. _CORE: https://core.ac.uk/about
 
+.. note::
+
+   The CORE engine requires an :py:obj:`API key <api_key>`.
+
 .. _core engine config:
 
 Configuration
@@ -17,135 +21,139 @@ The engine has the following additional settings:
 .. code:: yaml
 
   - name: core.ac.uk
-    engine: core
-    categories: science
-    shortcut: cor
     api_key: "..."
-    timeout: 5
+    inactive: false
 
 Implementations
 ===============
 
 """
-# pylint: disable=too-many-branches
+
+import typing as t
 
 from datetime import datetime
 from urllib.parse import urlencode
 
-from searx.exceptions import SearxEngineAPIException
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
 
 about = {
-    "website": 'https://core.ac.uk',
-    "wikidata_id": 'Q22661180',
-    "official_api_documentation": 'https://api.core.ac.uk/docs/v3',
+    "website": "https://core.ac.uk",
+    "wikidata_id": "Q22661180",
+    "official_api_documentation": "https://api.core.ac.uk/docs/v3",
     "use_official_api": True,
     "require_api_key": True,
-    "results": 'JSON',
+    "results": "JSON",
 }
 
-api_key = 'unset'
+api_key = ""
 """For an API key register at https://core.ac.uk/services/api and insert
 the API key in the engine :ref:`core engine config`."""
 
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
 paging = True
 nb_per_page = 10
-base_url = 'https://api.core.ac.uk/v3/search/works/'
+base_url = "https://api.core.ac.uk/v3/search/works/"
 
 
-def request(query, params):
-    if api_key == 'unset':
-        raise SearxEngineAPIException('missing CORE API key')
+def setup(engine_settings: dict[str, t.Any]) -> bool:
+    """Initialization of the CORE_ engine, checks whether the :py:obj:`api_key`
+    is set, otherwise the engine is inactive.
+    """
+
+    key: str = engine_settings.get("api_key", "")
+    if key and key not in ("unset", "unknown", "..."):
+        return True
+    logger.error("CORE's API key is not set or invalid.")
+    return False
+
+
+def request(query: str, params: "OnlineParams") -> None:
+    """Assemble the URL and the ``Authorization`` header of the API request."""
 
     # API v3 uses different parameters
     search_params = {
-        'q': query,
-        'offset': (params['pageno'] - 1) * nb_per_page,
-        'limit': nb_per_page,
-        'sort': 'relevance',
+        "q": query,
+        "offset": (params["pageno"] - 1) * nb_per_page,
+        "limit": nb_per_page,
+        "sort": "relevance",
     }
 
-    params['url'] = base_url + '?' + urlencode(search_params)
-    params['headers'] = {'Authorization': f'Bearer {api_key}'}
-
-    return params
+    params["url"] = base_url + "?" + urlencode(search_params)
+    params["headers"] = {"Authorization": f"Bearer {api_key}"}
 
 
-def response(resp):
-    results = []
+def response(resp: "SXNG_Response") -> EngineResults:
+    """Parse the JSON response of the API into ``Paper`` results."""
+    # pylint: disable=too-many-branches
+    res = EngineResults()
     json_data = resp.json()
 
-    for result in json_data.get('results', []):
+    for result in json_data.get("results", []):
         # Get title
-        if not result.get('title'):
+        if not result.get("title"):
             continue
 
         # Get URL - try different options
-        url = None
+        url: str | None = None
 
         # Try DOI first
-        doi = result.get('doi')
+        doi: str | None = result.get("doi")
         if doi:
-            url = f'https://doi.org/{doi}'
-
-        if url is None and result.get('doi'):
-            # use the DOI reference
-            url = 'https://doi.org/' + str(result['doi'])
-        elif result.get('id'):
-            url = 'https://core.ac.uk/works/' + str(result['id'])
-        elif result.get('downloadUrl'):
-            url = result['downloadUrl']
-        elif result.get('sourceFulltextUrls'):
-            url = result['sourceFulltextUrls']
+            url = f"https://doi.org/{doi}"
+        elif result.get("id"):
+            url = "https://core.ac.uk/works/" + str(result["id"])
+        elif result.get("downloadUrl"):
+            url = result["downloadUrl"]
+        elif result.get("sourceFulltextUrls"):
+            url = result["sourceFulltextUrls"]
         else:
             continue
 
         # Published date
         published_date = None
 
-        raw_date = result.get('publishedDate') or result.get('depositedDate')
+        raw_date = result.get("publishedDate") or result.get("depositedDate")
         if raw_date:
             try:
-                published_date = datetime.fromisoformat(result['publishedDate'].replace('Z', '+00:00'))
+                published_date = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
             except (ValueError, AttributeError):
                 pass
 
         # Handle journals
         journals = []
-        if result.get('journals'):
-            journals = [j.get('title') for j in result['journals'] if j.get('title')]
+        if result.get("journals"):
+            journals = [j.get("title") for j in result["journals"] if j.get("title")]
 
         # Handle publisher
-        publisher = result.get('publisher', '').strip("'")
-        if publisher:
-            publisher = publisher.strip("'")
+        publisher = (result.get("publisher") or "").strip("'")
 
         # Handle authors
-        authors = set()
-        for i in result.get('authors', []):
-            name = i.get("name")
+        authors: set[str] = set()
+        for i in result.get("authors") or []:
+            name: str | None = i.get("name")
             if name:
                 authors.add(name)
 
-        results.append(
-            {
-                'template': 'paper.html',
-                'title': result.get('title'),
-                'url': url,
-                'content': result.get('fullText', '') or '',
-                # 'comments': '',
-                'tags': result.get('fieldOfStudy', []),
-                'publishedDate': published_date,
-                'type': result.get('documentType', '') or '',
-                'authors': authors,
-                'editor': ', '.join(result.get('contributors', [])),
-                'publisher': publisher,
-                'journal': ', '.join(journals),
-                'doi': result.get('doi'),
-                # 'issn' : ''
-                # 'isbn' : ''
-                'pdf_url': result.get('downloadUrl', {}) or result.get("sourceFulltextUrls", {}),
-            }
+        res.add(
+            res.types.Paper(
+                title=result.get("title"),
+                url=url,
+                content=result.get("fullText", "") or "",
+                tags=result.get("fieldOfStudy", []),
+                publishedDate=published_date,
+                type=result.get("documentType", "") or "",
+                authors=authors,
+                editor=", ".join(result.get("contributors") or []),
+                publisher=publisher,
+                journal=", ".join(journals),
+                doi=result.get("doi"),
+                pdf_url=result.get("downloadUrl", {}) or result.get("sourceFulltextUrls", {}),
+            )
         )
 
-    return results
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 7d4d98c95..e34f501d2 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -659,12 +659,12 @@ engines:
     timeout: 30
     disabled: true
 
-  # - name: core.ac.uk
-  #   engine: core
-  #   categories: science
-  #   shortcut: cor
-  #   # get your API key from: https://core.ac.uk/api-keys/register/
-  #   api_key: 'unset'
+  - name: core.ac.uk
+    engine: core
+    shortcut: cor
+    # read https://docs.searxng.org/dev/engines/online/core.html
+    api_key: ""
+    inactive: true
 
   - name: cppreference
     engine: cppreference