From d8d5de4d47eeac922a0376e7e23de166610f8a8d Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 6 Oct 2025 10:12:38 +0200 Subject: [PATCH] [fix] google scholar - detect CAPTCHA (HTTP redirects) (#5268) In the case of .. response, for example, an HTTP 302 is returned by Google Scholar:: Our systems have detected unusual traffic from your computer network. Please try again later. Signed-off-by: Markus Heiser --- searx/engines/google_scholar.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index 8a82b36ee..b60b257bd 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -27,6 +27,7 @@ import typing as t from urllib.parse import urlencode from datetime import datetime from lxml import html +import httpx from searx.utils import ( eval_xpath, @@ -36,7 +37,7 @@ from searx.utils import ( ElementType, ) -from searx.exceptions import SearxEngineCaptchaException +from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( @@ -97,6 +98,15 @@ def request(query: str, params: "OnlineParams") -> None: def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals """Parse response from Google Scholar""" + if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers: + if "/sorry/index?continue" in resp.headers["Location"]: + # Our systems have detected unusual traffic from your computer + # network. Please try again later. + raise SearxEngineAccessDeniedException( + message="google_scholar: unusual traffic detected", + ) + raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}") + res = EngineResults() dom = html.fromstring(resp.text) detect_google_captcha(dom) @@ -192,7 +202,7 @@ def detect_google_captcha(dom: ElementType): not redirected to ``sorry.google.com``. """ if eval_xpath(dom, "//form[@id='gs_captcha_f']"): - raise SearxEngineCaptchaException() + raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)") def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: