[mod] Google Scholar engine: revision of the engine (Paper result)

Revision of the engine: use of the result type ``Paper`` as well as other
type annotations.
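
A minimal sketch of the migration this commit performs, from the legacy
``paper.html`` template dict to the typed ``Paper`` result; the field names are
taken from the diff below, and it is assumed that unset fields have defaults:

.. code:: python

    from searx.result_types import EngineResults

    def response(resp) -> EngineResults:
        res = EngineResults()
        # before: results.append({"template": "paper.html", "title": ..., "url": ..., ...})
        # after:  a typed result object added to the EngineResults container
        res.add(
            res.types.Paper(
                title="An example paper",
                url="https://example.org/paper",
                authors=["A. Author"],
                content="Abstract ...",
            )
        )
        return res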

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2025-09-10 16:39:24 +02:00 committed by Markus Heiser
parent 078c9fcb68
commit 599d9488c5
2 changed files with 152 additions and 128 deletions

View File

@@ -11,6 +11,8 @@ engines:
"""
import typing as t
import re
import random
import string
@@ -28,8 +30,10 @@ from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
from searx.result_types import EngineResults
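# imports below are resolved by the type checker only, not at runtime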
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q9366',
@@ -89,7 +93,7 @@ def ui_async(start: int) -> str:
return ",".join([arc_id, use_ac, _fmt])
def get_google_info(params, eng_traits):
def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[str, t.Any]:
"""Composing various (language) properties for the google engines (:ref:`google
API`).
@@ -144,7 +148,7 @@ def get_google_info(params, eng_traits):
"""
ret_val = {
ret_val: dict[str, t.Any] = {
'language': None,
'country': None,
'subdomain': None,
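
For orientation, a hedged sketch of the shape of the dict returned by
``get_google_info``; the keys are those visible in this hunk and those consumed
by the Scholar ``request`` further down, the values are illustrative:

.. code:: python

    google_info = get_google_info(params, traits)
    # roughly:
    # {
    #     "language": "en",
    #     "country": "US",
    #     "subdomain": "www.google.com",
    #     "params": {...},    # query string arguments merged into the GET request
    #     "headers": {...},   # HTTP headers for the request
    #     "cookies": {...},
    # }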
@@ -273,7 +277,7 @@ def detect_google_sorry(resp):
raise SearxEngineCaptchaException()
def request(query, params):
def request(query: str, params: "OnlineParams") -> None:
"""Google search request"""
# pylint: disable=line-too-long
start = (params['pageno'] - 1) * 10
@@ -317,7 +321,6 @@ def request(query, params):
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
@@ -341,7 +344,7 @@ def parse_data_images(text: str):
return data_image_map
def response(resp) -> EngineResults:
def response(resp: "SXNG_Response"):
"""Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp)

View File

@@ -1,12 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Scholar engine.
"""Google Scholar is a freely accessible web search engine that indexes the full
text or metadata of scholarly literature across an array of publishing formats
and disciplines.
Compared to other Google services the Scholar engine has a simple GET REST-API
and there does not exists `async` API. Even though the API slightly vintage we
can make use of the :ref:`google API` to assemble the arguments of the GET
and there is no ``async`` API. Even though the API is slightly vintage,
we can make use of the :ref:`google API` to assemble the arguments of the GET
request.
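
For illustration, assuming an English UI (``hl=en``), the arguments assembled
by ``request`` below end up in a plain GET URL roughly like:

.. code::

    https://scholar.google.com/scholar?q=searxng&hl=en&start=10&as_sdt=2007&as_vis=0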
Configuration
=============
.. code:: yaml
- name: google scholar
engine: google_scholar
shortcut: gos
Implementations
===============
"""
import typing as t
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
@@ -16,6 +33,7 @@ from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
ElementType,
)
from searx.exceptions import SearxEngineCaptchaException
@@ -26,18 +44,23 @@ from searx.engines.google import (
time_range_dict,
)
# about
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
about = {
"website": 'https://scholar.google.com',
"wikidata_id": 'Q494817',
"official_api_documentation": 'https://developers.google.com/custom-search',
"website": "https://scholar.google.com",
"wikidata_id": "Q494817",
"official_api_documentation": "https://developers.google.com/custom-search",
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"results": "HTML",
}
# engine dependent config
categories = ['science', 'scientific publications']
categories = ["science", "scientific publications"]
paging = True
max_page = 50
"""`Google max 50 pages`_
@@ -50,9 +73,97 @@ safesearch = False
send_accept_language_header = True
def time_range_args(params):
def request(query: str, params: "OnlineParams") -> None:
"""Google-Scholar search request"""
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
google_info["subdomain"] = google_info["subdomain"].replace("www.", "scholar.")
args = {
"q": query,
**google_info["params"],
"start": (params["pageno"] - 1) * 10,
"as_sdt": "2007", # include patents / to disable set "0,5"
"as_vis": "0", # include citations / to disable set "1"
}
args.update(time_range_args(params))
params["url"] = "https://" + google_info["subdomain"] + "/scholar?" + urlencode(args)
params["cookies"] = google_info["cookies"]
params["headers"].update(google_info["headers"])
def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
res = EngineResults()
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, "//div[@data-rp]"):
title = extract_text(eval_xpath(result, ".//h3[1]//a"))
if not title:
# this is a [ZITATION] block
continue
pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or ""
if pub_type:
pub_type = pub_type[1:-1].lower()
url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0)
content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or ""
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, ".//div[@class='gs_a']"))
)
if publisher in url:
publisher = ""
# cited by
comments: str = (
extract_text(eval_xpath(result, ".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or ""
)
# link to the html or pdf document
html_url: str = ""
pdf_url: str = ""
doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None)
doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']"))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
res.add(
res.types.Paper(
type=pub_type,
url=url,
title=title,
authors=authors,
publisher=publisher,
journal=journal,
publishedDate=publishedDate,
content=content,
comments=comments,
html_url=html_url,
pdf_url=pdf_url,
)
)
# parse suggestion
for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"):
res.add(res.types.LegacyResult(suggestion=extract_text(suggestion)))
for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"):
res.add(res.types.LegacyResult(correction=extract_text(correction)))
return res
def time_range_args(params: "OnlineParams") -> dict[str, int]:
Returns a dictionary with time range arguments based on
``params['time_range']``.
``params["time_range"]``.
Google Scholar supports a detailed search by year. Searching by *last
month* or *last week* (as offered by SearXNG) is uncommon for scientific
@@ -60,21 +171,23 @@ def time_range_args(params):
To limit the result list when the user selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
is set an empty dictionary of arguments is returned. Example; when
user selects a time range (current year minus one in 2022):
is set, an empty dictionary of arguments is returned.
Example: when the user selects a time range and the current year is 2025,
the result is the current year minus one:
.. code:: python
{ 'as_ylo' : 2021 }
{ "as_ylo" : 2024 }
"""
ret_val = {}
if params['time_range'] in time_range_dict:
ret_val['as_ylo'] = datetime.now().year - 1
ret_val: dict[str, int] = {}
if params["time_range"] in time_range_dict:
ret_val["as_ylo"] = datetime.now().year - 1
return ret_val
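
A hedged usage sketch; it assumes that ``time_range_dict`` (imported from the
google engine) is keyed by SearXNG's range names ``day``, ``week``, ``month``
and ``year``:

.. code:: python

    from datetime import datetime

    # every SearXNG range collapses to "since last year"
    time_range_args({"time_range": "month"})  # -> {"as_ylo": datetime.now().year - 1}
    time_range_args({"time_range": ""})       # -> {}  (no range selected)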
def detect_google_captcha(dom):
def detect_google_captcha(dom: ElementType):
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
not redirected to ``sorry.google.com``.
"""
@@ -82,29 +195,7 @@ def detect_google_captcha(dom):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google-Scholar search request"""
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
args = {
'q': query,
**google_info['params'],
'start': (params['pageno'] - 1) * 10,
'as_sdt': '2007', # include patents / to disable set '0,5'
'as_vis': '0', # include citations / to disable set '1'
}
args.update(time_range_args(params))
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def parse_gs_a(text: str | None):
def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]:
"""Parse the text written in green.
Possible formats:
@@ -113,98 +204,28 @@ def parse_gs_a(text: str | None):
* "{authors} - {publisher}"
"""
if text is None or text == "":
return None, None, None, None
return [], "", "", None
s_text = text.split(' - ')
authors = s_text[0].split(', ')
publisher = s_text[-1]
s_text = text.split(" - ")
authors: list[str] = s_text[0].split(", ")
publisher: str = s_text[-1]
if len(s_text) != 3:
return authors, None, publisher, None
return authors, "", publisher, None
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
# get journal and year
journal_year = s_text[1].split(', ')
journal_year = s_text[1].split(", ")
# journal is optional and may contain some commas
if len(journal_year) > 1:
journal = ', '.join(journal_year[0:-1])
if journal == '':
journal = None
journal: str = ", ".join(journal_year[0:-1])
if journal == "":
journal = ""
else:
journal = None
journal = ""
# year
year = journal_year[-1]
try:
publishedDate = datetime.strptime(year.strip(), '%Y')
publishedDate = datetime.strptime(year.strip(), "%Y")
except ValueError:
publishedDate = None
return authors, journal, publisher, publishedDate
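
A worked example; the input strings are made up, following the two formats
listed in the docstring:

.. code:: python

    authors, journal, publisher, published = parse_gs_a("A Author, B Author - Nature, 2021 - nature.com")
    # authors   == ["A Author", "B Author"]
    # journal   == "Nature"
    # publisher == "nature.com"
    # published == datetime(2021, 1, 1)

    parse_gs_a("A Author - nature.com")
    # -> (["A Author"], "", "nature.com", None)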
def response(resp): # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type:
pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append(
{
'template': 'paper.html',
'type': pub_type,
'url': url,
'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
}
)
# parse suggestion
for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
results.append({'correction': extract_text(correction)})
return results