[mod] PubMed engine: revision of the engine (Paper result)

Revision of the engine: results are now returned as the Paper result type, and type annotations were added throughout the module. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-09-29 15:30:51 -04:00 · 2025-09-10 16:43:42 +02:00 · 2025-09-10 16:43:42 +02:00 · bb22bb1831
commit bb22bb1831
parent 96e63df8ca
3 changed files with 123 additions and 97 deletions
--- a/docs/dev/engines/online/pubmed.rst
+++ b/docs/dev/engines/online/pubmed.rst
@ -0,0 +1,8 @@
 .. _pubmed engine:
 ======
 PubMed
 ======
 .. automodule:: searx.engines.pubmed
   :members:
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@ -1,132 +1,151 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""PubMed (Scholar publications)
+"""PubMed_ comprises more than 39 million citations for biomedical literature
 from MEDLINE, life science journals, and online books. Citations may include
 links to full text content from PubMed Central and publisher web sites.
 .. _PubMed: https://pubmed.ncbi.nlm.nih.gov/
 Configuration
 =============
 .. code:: yaml
   - name: pubmed
     engine: pubmed
     shortcut: pub
 Implementations
 ===============
 """
 import typing as t
 from datetime import datetime
 from urllib.parse import urlencode
 from lxml import etree
 from searx.result_types import EngineResults
 from searx.network import get
 from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    ElementType,
 )
-# about
+if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 about = {
-    "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
+    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
-    "wikidata_id": 'Q1540899',
+    "wikidata_id": "Q1540899",
    "official_api_documentation": {
-        'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
+        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
-        'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
+        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'XML',
+    "results": "XML",
 }
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
-base_url = (
+eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
 )
 # engine dependent config
 number_of_results = 10
-pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
+pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
def request(query: str, params: "OnlineParams") -> None:
    """Resolve *query* to PubMed IDs and point ``params["url"]`` at EFetch.

    Two E-utilities calls are involved: this function sends the ESearch
    request itself (via ``get``) to obtain the IDs of the requested page, then
    stores the EFetch URL for those IDs in ``params["url"]``.  The EFetch
    response is what ``response()`` parses.

    :param query: search term, passed to ESearch as ``term``.
    :param params: request parameters; ``params["pageno"]`` (1-based) is read
        and ``params["url"]`` is written.
    """
    # ESearch pages with ``retstart`` (offset) and ``retmax`` (page size).
    # The page size must be sent as ``retmax``; any other key is not an
    # ESearch parameter and the API would use its own default page size,
    # which breaks the offset arithmetic below.
    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * number_of_results,
            "retmax": number_of_results,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"

    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url)
    pmids_results = etree.XML(esearch_resp.content)
    # skip empty <Id/> elements so that the join below never sees ``None``
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id") if i.text]

    # send efetch request with the IDs from esearch response
    # NOTE(review): when ESearch yields no IDs the EFetch URL is still built
    # with an empty ``id`` value -- confirm how the API answers in that case.
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
    params["url"] = efetch_url
 def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
     """Parse the EFetch XML in *resp* and return one ``Paper`` per article.

     For every ``PubmedArticle`` element the title, PMID based URL, abstract,
     DOI, journal title, ISSN, author names and (if available and valid) the
     "accepted" date are extracted.

     :param resp: response of the EFetch request; ``resp.content`` is parsed
         as XML.
     :return: the collected results.
     """
     # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

     # parse efetch response
     efetch_xml = etree.XML(resp.content)
     res = EngineResults()

     def _field_txt(xml: ElementType, xpath_str: str) -> str:
         """Text of the first XPath match, or an empty string if none."""
         elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
         return extract_text(elem, allow_none=True) or ""

     def _all_text(elem: ElementType) -> str:
         """Whole text of *elem* including inline markup, whitespace collapsed."""
         return " ".join("".join(elem.itertext()).split())

     for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):
         medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
         # a missing PubmedData element must not abort the whole result page
         pubmed_data = eval_xpath_getindex(pubmed_article, "./PubmedData", 0, default=None)

         # titles may contain inline elements (<i>, <sub>, ...): ``.text``
         # would stop at (or be None before) the first child element
         title: str = _all_text(eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0))
         pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
         url: str = pubmed_url + pmid

         # join all abstract sections instead of keeping only the first text node
         content = " ".join(
             _all_text(section) for section in eval_xpath_list(medline_citation, ".//Abstract/AbstractText")
         ).strip()
         doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
         journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
         issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

         authors: list[str] = []
         for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
             f = eval_xpath_getindex(author, "./ForeName", 0, default=None)
             l = eval_xpath_getindex(author, "./LastName", 0, default=None)
             author_name = f"{f.text if f is not None else ''} {l.text if l is not None else ''}".strip()
             if author_name:
                 authors.append(author_name)

         accepted_date = None
         if pubmed_data is not None:
             accepted_date = eval_xpath_getindex(
                 pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
             )

         pub_date = None
         if accepted_date is not None:
             try:
                 # a missing or empty part yields "" and ends up in ValueError
                 pub_date = datetime(
                     year=int(_field_txt(accepted_date, "./Year/text()")),
                     month=int(_field_txt(accepted_date, "./Month/text()")),
                     day=int(_field_txt(accepted_date, "./Day/text()")),
                 )
             except ValueError:
                 # incomplete or out-of-range date: publish without a date
                 pass

         res.add(
             res.types.Paper(
                 url=url,
                 title=title,
                 content=content,
                 journal=journal,
                 # no ISSN -> empty list instead of a list with an empty string
                 issn=[issn] if issn else [],
                 authors=authors,
                 doi=doi,
                 publishedDate=pub_date,
             )
         )

     return res
--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -1735,7 +1735,6 @@ engines:
  - name: pubmed
    engine: pubmed
    shortcut: pub
    timeout: 3.0
  - name: pypi
    shortcut: pypi