mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-30 18:22:31 -04:00 
			
		
		
		
	Revision of the engine / use of the result type Paper as well as other typifications. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
		
			
				
	
	
		
			152 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			152 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # SPDX-License-Identifier: AGPL-3.0-or-later
 | |
| """PubMed_ comprises more than 39 million citations for biomedical literature
 | |
| from MEDLINE, life science journals, and online books. Citations may include
 | |
| links to full text content from PubMed Central and publisher web sites.
 | |
| 
 | |
| .. _PubMed: https://pubmed.ncbi.nlm.nih.gov/
 | |
| 
 | |
| Configuration
 | |
| =============
 | |
| 
 | |
| .. code:: yaml
 | |
| 
 | |
|    - name: pubmed
 | |
|      engine: pubmed
 | |
|      shortcut: pub
 | |
| 
 | |
| Implementations
 | |
| ===============
 | |
| 
 | |
| """
 | |
| 
 | |
| import typing as t
 | |
| 
 | |
| from datetime import datetime
 | |
| from urllib.parse import urlencode
 | |
| 
 | |
| from lxml import etree
 | |
| 
 | |
| from searx.result_types import EngineResults
 | |
| from searx.network import get
 | |
| from searx.utils import (
 | |
|     eval_xpath_getindex,
 | |
|     eval_xpath_list,
 | |
|     extract_text,
 | |
|     ElementType,
 | |
| )
 | |
| 
 | |
| if t.TYPE_CHECKING:
 | |
|     from searx.extended_types import SXNG_Response
 | |
|     from searx.search.processors import OnlineParams
 | |
| 
 | |
| 
 | |
| about = {
 | |
|     "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
 | |
|     "wikidata_id": "Q1540899",
 | |
|     "official_api_documentation": {
 | |
|         "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
 | |
|         "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
 | |
|     },
 | |
|     "use_official_api": True,
 | |
|     "require_api_key": False,
 | |
|     "results": "XML",
 | |
| }
 | |
| 
 | |
| categories = ["science", "scientific publications"]
 | |
| 
 | |
| eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 | |
| 
 | |
| # engine dependent config
 | |
| number_of_results = 10
 | |
| pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
 | |
| 
 | |
| 
 | |
| def request(query: str, params: "OnlineParams") -> None:
 | |
| 
 | |
|     args = urlencode(
 | |
|         {
 | |
|             "db": "pubmed",
 | |
|             "term": query,
 | |
|             "retstart": (params["pageno"] - 1) * number_of_results,
 | |
|             "hits": number_of_results,
 | |
|         }
 | |
|     )
 | |
|     esearch_url = f"{eutils_api}/esearch.fcgi?{args}"
 | |
|     # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
 | |
|     esearch_resp: "SXNG_Response" = get(esearch_url)
 | |
|     pmids_results = etree.XML(esearch_resp.content)
 | |
|     pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id")]
 | |
| 
 | |
|     # send efetch request with the IDs from esearch response
 | |
|     args = urlencode(
 | |
|         {
 | |
|             "db": "pubmed",
 | |
|             "retmode": "xml",
 | |
|             "id": ",".join(pmids),
 | |
|         }
 | |
|     )
 | |
|     efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
 | |
|     params["url"] = efetch_url
 | |
| 
 | |
| 
 | |
| def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
 | |
| 
 | |
|     # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd
 | |
| 
 | |
|     # parse efetch response
 | |
|     efetch_xml = etree.XML(resp.content)
 | |
|     res = EngineResults()
 | |
| 
 | |
|     def _field_txt(xml: ElementType, xpath_str: str) -> str:
 | |
|         elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
 | |
|         return extract_text(elem, allow_none=True) or ""
 | |
| 
 | |
|     for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):
 | |
| 
 | |
|         medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
 | |
|         pubmed_data: ElementType = eval_xpath_getindex(pubmed_article, "./PubmedData", 0)
 | |
| 
 | |
|         title: str = eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0).text
 | |
|         pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
 | |
|         url: str = pubmed_url + pmid
 | |
|         content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
 | |
|         doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
 | |
|         journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
 | |
|         issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")
 | |
| 
 | |
|         authors: list[str] = []
 | |
| 
 | |
|         for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
 | |
|             f = eval_xpath_getindex(author, "./ForeName", 0, default=None)
 | |
|             l = eval_xpath_getindex(author, "./LastName", 0, default=None)
 | |
|             author_name = f"{f.text if f is not None else ''} {l.text if l is not None else ''}".strip()
 | |
|             if author_name:
 | |
|                 authors.append(author_name)
 | |
| 
 | |
|         accepted_date = eval_xpath_getindex(
 | |
|             pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
 | |
|         )
 | |
|         pub_date = None
 | |
|         if accepted_date is not None:
 | |
|             year = eval_xpath_getindex(accepted_date, "./Year", 0)
 | |
|             month = eval_xpath_getindex(accepted_date, "./Month", 0)
 | |
|             day = eval_xpath_getindex(accepted_date, "./Day", 0)
 | |
|             try:
 | |
|                 pub_date = datetime(year=int(year.text), month=int(month.text), day=int(day.text))
 | |
|             except ValueError:
 | |
|                 pass
 | |
| 
 | |
|         res.add(
 | |
|             res.types.Paper(
 | |
|                 url=url,
 | |
|                 title=title,
 | |
|                 content=content,
 | |
|                 journal=journal,
 | |
|                 issn=[issn],
 | |
|                 authors=authors,
 | |
|                 doi=doi,
 | |
|                 publishedDate=pub_date,
 | |
|             )
 | |
|         )
 | |
|     return res
 |