mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	[fix] google scholar - detect CAPTCHA (HTTP redirects) (#5268)
In the case of a CAPTCHA response, for example, Google Scholar returns an
HTTP 302 redirect to its "sorry" page, which shows the message::
    Our systems have detected unusual traffic from your computer
    network. Please try again later.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
			
			
This commit is contained in:
		
							parent
							
								
									34eb32f418
								
							
						
					
					
						commit
						d8d5de4d47
					
				| @ -27,6 +27,7 @@ import typing as t | ||||
| from urllib.parse import urlencode | ||||
| from datetime import datetime | ||||
| from lxml import html | ||||
| import httpx | ||||
| 
 | ||||
| from searx.utils import ( | ||||
|     eval_xpath, | ||||
| @ -36,7 +37,7 @@ from searx.utils import ( | ||||
|     ElementType, | ||||
| ) | ||||
| 
 | ||||
| from searx.exceptions import SearxEngineCaptchaException | ||||
| from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException | ||||
| 
 | ||||
| from searx.engines.google import fetch_traits  # pylint: disable=unused-import | ||||
| from searx.engines.google import ( | ||||
| @ -97,6 +98,15 @@ def request(query: str, params: "OnlineParams") -> None: | ||||
| def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals | ||||
|     """Parse response from Google Scholar""" | ||||
| 
 | ||||
|     if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers: | ||||
|         if "/sorry/index?continue" in resp.headers["Location"]: | ||||
|             # Our systems have detected unusual traffic from your computer | ||||
|             # network. Please try again later. | ||||
|             raise SearxEngineAccessDeniedException( | ||||
|                 message="google_scholar: unusual traffic detected", | ||||
|             ) | ||||
|         raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}") | ||||
| 
 | ||||
|     res = EngineResults() | ||||
|     dom = html.fromstring(resp.text) | ||||
|     detect_google_captcha(dom) | ||||
| @ -192,7 +202,7 @@ def detect_google_captcha(dom: ElementType): | ||||
|     not redirected to ``sorry.google.com``. | ||||
|     """ | ||||
|     if eval_xpath(dom, "//form[@id='gs_captcha_f']"): | ||||
|         raise SearxEngineCaptchaException() | ||||
|         raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)") | ||||
| 
 | ||||
| 
 | ||||
| def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user