[mod] PubMed engine: revision of the engine (Paper result)

Revision of the engine: use the result type Paper and add type annotations.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2025-09-10 16:43:42 +02:00 committed by Markus Heiser
parent 96e63df8ca
commit bb22bb1831
3 changed files with 123 additions and 97 deletions

View File

@ -0,0 +1,8 @@
.. _pubmed engine:

======
PubMed
======

.. automodule:: searx.engines.pubmed
   :members:

View File

@ -1,132 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed (Scholar publications)
"""PubMed_ comprises more than 39 million citations for biomedical literature
from MEDLINE, life science journals, and online books. Citations may include
links to full text content from PubMed Central and publisher web sites.

.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/

Configuration
=============

.. code:: yaml

   - name: pubmed
     engine: pubmed
     shortcut: pub

Implementations
===============
"""
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from lxml import etree
from searx.result_types import EngineResults
from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
ElementType,
)
# about
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Descriptive metadata about the PubMed service and the API this engine talks
# to.  Each key is listed exactly once (a dict literal silently keeps only the
# last value of a repeated key).
about = {
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": "XML",
}
# Result categories this engine is listed under.
categories = ["science", "scientific publications"]

# Legacy esearch URL template (placeholders: query, offset, hits).  Retained so
# that code still formatting ``base_url`` keeps working.
base_url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=pubmed&{query}&retstart={offset}&retmax={hits}"
)

# Common prefix of the NCBI E-utilities endpoints (esearch.fcgi, efetch.fcgi).
eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# engine dependent config
number_of_results = 10

# Prefix of an article's landing page; the PMID is appended to it.
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
def request(query: str, params: "OnlineParams") -> None:
    """Resolve *query* to PubMed IDs and point the request at efetch.

    Two E-utilities calls are involved: ``esearch`` turns the query into one
    page of PubMed IDs (PMIDs) and ``efetch`` returns the article records for
    those IDs.  The esearch call is issued here via ``get``; the efetch URL is
    stored in ``params["url"]``.

    :param query: Search terms, sent to esearch as ``term``.
    :param params: Request parameters.  ``params["pageno"]`` is read (page 1
        maps to offset 0) and ``params["url"]`` is set to the efetch URL.
    """
    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * number_of_results,
            # The esearch page-size parameter is named ``retmax``.  Any other
            # name is not a documented esearch parameter, so the page size
            # would no longer match the ``retstart`` offset computed above.
            "retmax": number_of_results,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"

    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url)
    pmids_results = etree.XML(esearch_resp.content)
    # Skip <Id> elements without text so that ``join`` never sees ``None``.
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id") if i.text]

    # send efetch request with the IDs from esearch response
    # NOTE(review): when esearch yields no IDs the efetch URL carries an empty
    # ``id`` value -- confirm how the caller / API handles that case.
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
    params["url"] = efetch_url
def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Convert an efetch XML response into ``Paper`` results.

    One result is added per ``PubmedArticle`` element.  Title, PMID, abstract,
    DOI, journal, ISSN and authors are read from ``MedlineCitation``; the
    publication date is the "accepted" entry of ``PubmedData/History`` when
    one is present and parseable, otherwise ``None``.

    :param resp: Response of the efetch request; ``resp.content`` is parsed
        as XML.
    :return: The collected results.
    """
    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

    def _field_txt(xml: ElementType, xpath_str: str) -> str:
        """Return the first match of *xpath_str* below *xml* as text, else ``""``."""
        elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
        return extract_text(elem, allow_none=True) or ""

    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):
        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)

        # Titles may carry inline markup (<i>, <sub>, ...).  ``.text`` would
        # stop at the first child element (or be None), so gather all text.
        title_elem = eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0)
        title: str = "".join(title_elem.itertext()).strip()

        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
        url: str = pubmed_url + pmid

        content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        authors: list[str] = []
        for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
            fore_name = eval_xpath_getindex(author, "./ForeName", 0, default=None)
            last_name = eval_xpath_getindex(author, "./LastName", 0, default=None)
            # Ignore absent or empty name parts, so the literal "None" never
            # ends up in an author's name.
            name_parts = [elem.text for elem in (fore_name, last_name) if elem is not None and elem.text]
            author_name = " ".join(name_parts).strip()
            if author_name:
                authors.append(author_name)

        # Looked up from the article element with a default: an article
        # without PubmedData (or without an "accepted" date) simply gets no
        # publication date.
        accepted_date = eval_xpath_getindex(
            pubmed_article, "./PubmedData/History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
        )
        pub_date = None
        if accepted_date is not None:
            year = _field_txt(accepted_date, "./Year/text()")
            month = _field_txt(accepted_date, "./Month/text()")
            day = _field_txt(accepted_date, "./Day/text()")
            try:
                pub_date = datetime(year=int(year), month=int(month), day=int(day))
            except ValueError:
                # Best effort: a missing, non-numeric or out-of-range date
                # part leaves the result without a publication date.
                pass

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                # Avoid handing over a list with one empty string when the
                # record has no ISSN.
                issn=[issn] if issn else [],
                authors=authors,
                doi=doi,
                publishedDate=pub_date,
            )
        )

    return res

View File

@ -1735,7 +1735,6 @@ engines:
- name: pubmed
engine: pubmed
shortcut: pub
timeout: 3.0
- name: pypi
shortcut: pypi