mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-04 03:27:06 -05:00 
			
		
		
		
	[fix] Get an actual sc argument from startpage's home page.
				
					
				
			Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									1cbcddb3f7
								
							
						
					
					
						commit
						2f4e567e90
					
				@ -5,6 +5,7 @@
 | 
				
			|||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					from time import time
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from urllib.parse import urlencode
 | 
					from urllib.parse import urlencode
 | 
				
			||||||
from unicodedata import normalize, combining
 | 
					from unicodedata import normalize, combining
 | 
				
			||||||
@ -15,6 +16,7 @@ from lxml import html
 | 
				
			|||||||
from babel import Locale
 | 
					from babel import Locale
 | 
				
			||||||
from babel.localedata import locale_identifiers
 | 
					from babel.localedata import locale_identifiers
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from searx import network
 | 
				
			||||||
from searx.utils import extract_text, eval_xpath, match_language
 | 
					from searx.utils import extract_text, eval_xpath, match_language
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# about
 | 
					# about
 | 
				
			||||||
@ -47,6 +49,41 @@ results_xpath = '//div[@class="w-gl__result__main"]'
 | 
				
			|||||||
link_xpath = './/a[@class="w-gl__result-title result-link"]'
 | 
					link_xpath = './/a[@class="w-gl__result-title result-link"]'
 | 
				
			||||||
content_xpath = './/p[@class="w-gl__description"]'
 | 
					content_xpath = './/p[@class="w-gl__description"]'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# timestamp of the last fetch of 'sc' code
 | 
				
			||||||
 | 
					sc_code_ts = 0
 | 
				
			||||||
 | 
					sc_code = ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_sc_code(headers):
 | 
				
			||||||
 | 
					    """Get an actual `sc` argument from startpage's home page.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Startpage puts a `sc` argument on every link.  Without this argument
 | 
				
			||||||
 | 
					    startpage considers the request to be from a bot.  We do not know what is
 | 
				
			||||||
 | 
					    encoded in the value of the `sc` argument, but it seems to be a kind of
 | 
				
			||||||
 | 
					    *time-stamp*.  This *time-stamp* is valid for a few hours.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This function scrapes a new *time-stamp* from startpage's home page every 50 minutes
 | 
				
			||||||
 | 
					    (3000 sec).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    global sc_code_ts, sc_code  # pylint: disable=global-statement
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if time() > (sc_code_ts + 3000):
 | 
				
			||||||
 | 
					        logger.debug("query new sc time-stamp ...")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        resp = network.get(base_url, headers=headers)
 | 
				
			||||||
 | 
					        dom = html.fromstring(resp.text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # href --> '/?sc=adrKJMgF8xwp20'
 | 
				
			||||||
 | 
					        href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        sc_code = href[5:]
 | 
				
			||||||
 | 
					        sc_code_ts = time()
 | 
				
			||||||
 | 
					        logger.debug("new value is: %s", sc_code)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return sc_code
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# do search-request
 | 
					# do search-request
 | 
				
			||||||
def request(query, params):
 | 
					def request(query, params):
 | 
				
			||||||
@ -56,7 +93,7 @@ def request(query, params):
 | 
				
			|||||||
        'page': params['pageno'],
 | 
					        'page': params['pageno'],
 | 
				
			||||||
        'cat': 'web',
 | 
					        'cat': 'web',
 | 
				
			||||||
        # 'abp': "-1",
 | 
					        # 'abp': "-1",
 | 
				
			||||||
        'sc': 'Mj4jZy61QETj20',
 | 
					        'sc': get_sc_code(params['headers']),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # set language if specified
 | 
					    # set language if specified
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user