[mod] PubMed engine: revision of the engine (Paper result)

Revision of the engine: use the result type Paper and add type annotations. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-09-29 15:30:51 -04:00 · 2025-09-10 16:43:42 +02:00 · 2025-09-10 16:43:42 +02:00 · bb22bb1831
commit bb22bb1831
parent 96e63df8ca
3 changed files with 123 additions and 97 deletions
--- a/docs/dev/engines/online/pubmed.rst
+++ b/docs/dev/engines/online/pubmed.rst
@ -0,0 +1,8 @@
+.. _pubmed engine:
+
+======
+PubMed
+======
+
+.. automodule:: searx.engines.pubmed
+   :members:
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@ -1,132 +1,151 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""PubMed (Scholar publications)
+"""PubMed_ comprises more than 39 million citations for biomedical literature
+from MEDLINE, life science journals, and online books. Citations may include
+links to full text content from PubMed Central and publisher web sites.
+
+.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/
+
+Configuration
+=============
+
+.. code:: yaml
+
+   - name: pubmed
+     engine: pubmed
+     shortcut: pub
+
+Implementations
+===============

 """

+import typing as t
+
 from datetime import datetime
 from urllib.parse import urlencode

 from lxml import etree
+
+from searx.result_types import EngineResults
 from searx.network import get
 from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
+    ElementType,
 )

-# about
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
+
+# Engine metadata: project website, Wikidata ID, link to the official API
+# documentation and the format of the API results.
 about = {
-    "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
-    "wikidata_id": 'Q1540899',
+    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
+    "wikidata_id": "Q1540899",
    "official_api_documentation": {
-        'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
-        'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
+        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
+        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'XML',
+    "results": "XML",
 }

-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]

-base_url = (
-    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
-)
+# Base URL of the E-utilities API; ``request`` appends ``/esearch.fcgi`` and
+# ``/efetch.fcgi`` to it.
+eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

 # engine dependent config
+# number_of_results: results per page, used in ``request`` to compute the
+# ``retstart`` offset from the page number.
 number_of_results = 10
-pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
+# URL prefix to which ``response`` appends the PMID to build the result URL.
+pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"


-def request(query, params):
-    # basic search
-    offset = (params['pageno'] - 1) * number_of_results
+def request(query: str, params: "OnlineParams") -> None:

-    string_args = {
-        'query': urlencode({'term': query}),
-        'offset': offset,
-        'hits': number_of_results,
-    }
-
-    params['url'] = base_url.format(**string_args)
-
-    return params
-
-
-def response(resp):  # pylint: disable=too-many-locals
-    results = []
-
-    # First retrieve notice of each result
-    pubmed_retrieve_api_url = (
-        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'
-    )
-
-    pmids_results = etree.XML(resp.content)
-    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
-    pmids_string = ''
-
-    for item in pmids:
-        pmids_string += item.text + ','
-
-    retrieve_notice_args = {'pmids_string': pmids_string}
-
-    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
-
-    search_results_response = get(retrieve_url_encoded).content
-    search_results = etree.XML(search_results_response)
-    for entry in eval_xpath_list(search_results, '//PubmedArticle'):
-        medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
-
-        title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
-        pmid = eval_xpath_getindex(medline, './/PMID', 0).text
-        url = pubmed_url + pmid
-        content = extract_text(
-            eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
-        )
-        doi = extract_text(
-            eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
-        )
-        journal = extract_text(
-            eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
-        )
-        issn = extract_text(
-            eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
-        )
-        authors = []
-        for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
-            f = eval_xpath_getindex(author, './ForeName', 0, default=None)
-            l = eval_xpath_getindex(author, './LastName', 0, default=None)
-            f = '' if f is None else f.text
-            l = '' if l is None else l.text
-            authors.append((f + ' ' + l).strip())
-
-        res_dict = {
-            'template': 'paper.html',
-            'url': url,
-            'title': title,
-            'content': content or "",
-            'journal': journal,
-            'issn': [issn],
-            'authors': authors,
-            'doi': doi,
+    """Resolve ``query`` into PubMed IDs and set the *efetch* URL.
+
+    Fetching the article records takes two E-utilities calls:
+
+    1. An *esearch* request is sent from within this function (using
+       :py:obj:`searx.network.get`) to get the list of PubMed IDs (PMIDs)
+       matching ``query`` for the requested page.
+    2. ``params["url"]`` is set to the *efetch* URL for these PMIDs; the XML
+       returned for that URL is what :py:obj:`response` parses.
+    """
+    args = urlencode(
+        {
+            "db": "pubmed",
+            "term": query,
+            "retstart": (params["pageno"] - 1) * number_of_results,
+            # The page size argument of esearch is named "retmax".  It has to
+            # match the step used for "retstart", otherwise consecutive pages
+            # overlap.
+            "retmax": number_of_results,
        }
+    )
+    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"
+    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
+    esearch_resp: "SXNG_Response" = get(esearch_url)
+    pmids_results = etree.XML(esearch_resp.content)
+    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id")]
+
+    # send efetch request with the IDs from esearch response
+    args = urlencode(
+        {
+            "db": "pubmed",
+            "retmode": "xml",
+            "id": ",".join(pmids),
+        }
+    )
+    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
+    params["url"] = efetch_url
+
+
+def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
+    """Parse the *efetch* XML in ``resp.content`` into ``Paper`` results.
+
+    One result is added per ``PubmedArticle`` element.  Optional fields that
+    are missing in the XML (abstract, DOI, journal, ISSN) are returned as empty
+    values.  ``publishedDate`` is taken from the ``accepted`` entry of the
+    article history and is ``None`` when that entry is missing, incomplete or
+    not a valid date.
+    """
+
+    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd
+
+    # parse efetch response
+    efetch_xml = etree.XML(resp.content)
+    res = EngineResults()
+
+    def _field_txt(xml: ElementType, xpath_str: str) -> str:
+        # text of the first match of xpath_str, "" if there is no match
+        elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
+        return extract_text(elem, allow_none=True) or ""
+
+    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):
+
+        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
+
+        # The title can contain inline markup (<i>, <sub>, ..); ``.text`` would
+        # stop at the first child element (or be None if the title starts with
+        # one), so the text of all descendants is joined.
+        title_elem: ElementType = eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0)
+        title: str = "".join(title_elem.itertext()).strip()
+        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
+        url: str = pubmed_url + pmid
+        # NOTE(review): only the first text node of the abstract is used here;
+        # confirm that this truncation is intended.
+        content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
+        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
+        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
+        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")
+
+        authors: list[str] = []
+
+        for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
+            f = eval_xpath_getindex(author, "./ForeName", 0, default=None)
+            l = eval_xpath_getindex(author, "./LastName", 0, default=None)
+            author_name = f"{f.text if f is not None else ''} {l.text if l is not None else ''}".strip()
+            if author_name:
+                authors.append(author_name)

        accepted_date = eval_xpath_getindex(
-            entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
+            pubmed_article, "./PubmedData/History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
        )
+        pub_date = None
        if accepted_date is not None:
-            year = eval_xpath_getindex(accepted_date, './Year', 0)
-            month = eval_xpath_getindex(accepted_date, './Month', 0)
-            day = eval_xpath_getindex(accepted_date, './Day', 0)
+            # Missing or empty parts become "", int("") raises the ValueError
+            # handled below.
+            year = _field_txt(accepted_date, "./Year/text()")
+            month = _field_txt(accepted_date, "./Month/text()")
+            day = _field_txt(accepted_date, "./Day/text()")
            try:
-                publishedDate = datetime.strptime(
-                    year.text + '-' + month.text + '-' + day.text,
-                    '%Y-%m-%d',
-                )
-                res_dict['publishedDate'] = publishedDate
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                print(e)
+                pub_date = datetime(year=int(year), month=int(month), day=int(day))
+            except ValueError:
+                # incomplete or invalid date: leave publishedDate unset
+                pass

-        results.append(res_dict)
-
-    return results
+        res.add(
+            res.types.Paper(
+                url=url,
+                title=title,
+                content=content,
+                journal=journal,
+                # no ISSN in the record: empty list instead of [""]
+                issn=[issn] if issn else [],
+                authors=authors,
+                doi=doi,
+                publishedDate=pub_date,
+            )
+        )
+    return res
--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -1735,7 +1735,6 @@ engines:
  - name: pubmed
    engine: pubmed
    shortcut: pub
-    timeout: 3.0

  - name: pypi
    shortcut: pypi