	[fix] google scholar - detect CAPTCHA (HTTP redirects) (#5268)
In the case of a CAPTCHA response, for example, an HTTP 302 is returned by Google Scholar::
    Our systems have detected unusual traffic from your computer
    network. Please try again later.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
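
For context, the redirect can be observed outside of SearXNG with a plain httpx request that does not follow redirects. A minimal sketch, not part of the commit; the query parameters are placeholders, and it assumes the engine's HTTP client also keeps redirects disabled (otherwise the 302 would be followed transparently)::

    # Minimal sketch: trigger-side view of the redirect that the patched
    # response() now detects.  Placeholder query; redirects deliberately
    # not followed so the 302 and its Location header stay visible.
    import httpx

    resp = httpx.get(
        "https://scholar.google.com/scholar",
        params={"q": "test", "hl": "en"},
        follow_redirects=False,
    )
    if resp.status_code in (301, 302, 303, 307, 308):
        location = resp.headers.get("Location", "")
        if "/sorry/index?continue" in location:
            print("Google Scholar answered with the 'unusual traffic' CAPTCHA redirect")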
			
			
parent 34eb32f418
commit d8d5de4d47
@@ -27,6 +27,7 @@ import typing as t
 from urllib.parse import urlencode
 from datetime import datetime
 from lxml import html
+import httpx
 
 from searx.utils import (
     eval_xpath,
@@ -36,7 +37,7 @@ from searx.utils import (
     ElementType,
 )
 
-from searx.exceptions import SearxEngineCaptchaException
+from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException
 
 from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
@@ -97,6 +98,15 @@ def request(query: str, params: "OnlineParams") -> None:
 def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
     """Parse response from Google Scholar"""
 
+    if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
+        if "/sorry/index?continue" in resp.headers["Location"]:
+            # Our systems have detected unusual traffic from your computer
+            # network. Please try again later.
+            raise SearxEngineAccessDeniedException(
+                message="google_scholar: unusual traffic detected",
+            )
+        raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}")
+
     res = EngineResults()
     dom = html.fromstring(resp.text)
     detect_google_captcha(dom)
@@ -192,7 +202,7 @@ def detect_google_captcha(dom: ElementType):
     not redirected to ``sorry.google.com``.
     """
     if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
-        raise SearxEngineCaptchaException()
+        raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)")
 
 
 def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]:
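
The new branch can be exercised with a bare stand-in for the response object, since only ``status_code`` and ``headers`` are read before the exception is raised. A test sketch under assumptions that are not part of this commit: the engine module is importable as ``searx.engines.google_scholar`` and pytest is available::

    import types
    import pytest

    from searx.exceptions import SearxEngineAccessDeniedException
    from searx.engines import google_scholar  # assumed module path, not shown in the diff


    def test_sorry_redirect_raises_access_denied():
        # Only status_code and headers are touched before the raise, so a
        # SimpleNamespace is enough to reach the new code path.
        resp = types.SimpleNamespace(
            status_code=302,
            headers={"Location": "https://www.google.com/sorry/index?continue=https://scholar.google.com/scholar"},
        )
        with pytest.raises(SearxEngineAccessDeniedException):
            google_scholar.response(resp)

Raising SearxEngineAccessDeniedException lets SearXNG handle the CAPTCHA redirect like other access-denied cases, while any other unexpected redirect surfaces as httpx.TooManyRedirects instead of being parsed as an empty result page.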