# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed_ comprises more than 39 million citations for biomedical literature
from MEDLINE, life science journals, and online books. Citations may include
links to full text content from PubMed Central and publisher web sites.

.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/

Configuration
=============

.. code:: yaml

   - name: pubmed
     engine: pubmed
     shortcut: pub

Implementations
===============

"""

import typing as t

from datetime import datetime
from urllib.parse import urlencode

from lxml import etree

from searx.result_types import EngineResults
from searx.network import get
from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    ElementType,
)

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams


about = {
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": "XML",
}

categories = ["science", "scientific publications"]

eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# engine dependent config
number_of_results = 10
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"

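
# Note: the engine queries the NCBI E-utilities in two steps: request() first
# calls esearch.fcgi to resolve the search term into a list of PMIDs, then sets
# params["url"] to an efetch.fcgi URL; SearXNG fetches that URL and response()
# parses the returned XML records into Paper results.
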
def request(query: str, params: "OnlineParams") -> None:

    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * number_of_results,
            "retmax": number_of_results,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"
    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
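    # the esearch response is a small XML document, roughly of the form
    #   <eSearchResult>...<IdList><Id>12345678</Id>...</IdList>...</eSearchResult>
    # only the PMIDs listed in IdList are used below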
    esearch_resp: "SXNG_Response" = get(esearch_url)
    pmids_results = etree.XML(esearch_resp.content)
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id")]

    # send efetch request with the IDs from esearch response
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
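    # resulting efetch URL has roughly this shape (illustrative PMIDs):
    #   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=12345678,23456789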
    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
    params["url"] = efetch_url

def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals

    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

    # parse efetch response
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

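    # small helper: evaluate an XPath on xml and return the text of the first
    # match, or an empty string when there is no match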
    def _field_txt(xml: ElementType, xpath_str: str) -> str:
        elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
        return extract_text(elem, allow_none=True) or ""

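    # each <PubmedArticle> element combines a <MedlineCitation> (bibliographic
    # metadata) with a <PubmedData> element (publication history, among others)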
    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):

        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
        pubmed_data: ElementType = eval_xpath_getindex(pubmed_article, "./PubmedData", 0)

        title: str = eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0).text
        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
        url: str = pubmed_url + pmid
        content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        authors: list[str] = []

        for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
            f = eval_xpath_getindex(author, "./ForeName", 0, default=None)
            l = eval_xpath_getindex(author, "./LastName", 0, default=None)
            author_name = f"{f.text if f is not None else ''} {l.text if l is not None else ''}".strip()
            if author_name:
                authors.append(author_name)

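        # published date: taken from the "accepted" entry of the publication
        # history; illustrative shape of that element:
        #   <PubMedPubDate PubStatus="accepted"><Year>2024</Year><Month>5</Month><Day>17</Day></PubMedPubDate>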
        accepted_date = eval_xpath_getindex(
            pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
        )
        pub_date = None
        if accepted_date is not None:
            year = eval_xpath_getindex(accepted_date, "./Year", 0)
            month = eval_xpath_getindex(accepted_date, "./Month", 0)
            day = eval_xpath_getindex(accepted_date, "./Day", 0)
            try:
                pub_date = datetime(year=int(year.text), month=int(month.text), day=int(day.text))
            except ValueError:
                pass

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                issn=[issn],
                authors=authors,
                doi=doi,
                publishedDate=pub_date,
            )
        )

    return res