[fix] google scholar - detect CAPTCHA (HTTP redirects) (#5268)

In the case of the following response, for example, Google Scholar returns an
HTTP 302 (redirect)::

    Our systems have detected unusual traffic from your computer
    network. Please try again later.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2025-10-06 10:12:38 +02:00 committed by GitHub
parent 34eb32f418
commit d8d5de4d47
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -27,6 +27,7 @@ import typing as t
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from lxml import html from lxml import html
import httpx
from searx.utils import ( from searx.utils import (
eval_xpath, eval_xpath,
@ -36,7 +37,7 @@ from searx.utils import (
ElementType, ElementType,
) )
from searx.exceptions import SearxEngineCaptchaException from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException
from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import ( from searx.engines.google import (
@ -97,6 +98,15 @@ def request(query: str, params: "OnlineParams") -> None:
def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals
"""Parse response from Google Scholar""" """Parse response from Google Scholar"""
if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
if "/sorry/index?continue" in resp.headers["Location"]:
# Our systems have detected unusual traffic from your computer
# network. Please try again later.
raise SearxEngineAccessDeniedException(
message="google_scholar: unusual traffic detected",
)
raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}")
res = EngineResults() res = EngineResults()
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
detect_google_captcha(dom) detect_google_captcha(dom)
@ -192,7 +202,7 @@ def detect_google_captcha(dom: ElementType):
not redirected to ``sorry.google.com``. not redirected to ``sorry.google.com``.
""" """
if eval_xpath(dom, "//form[@id='gs_captcha_f']"): if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
raise SearxEngineCaptchaException() raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)")
def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: