mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-03 19:17:07 -05:00 
			
		
		
		
	[refactor] duckduckgo engine: improve request logic and code structure (#4837)
Changes:
- Add trailing slash to base URL to prevent potential redirects
- Remove advanced search syntax filtering (advanced syntax no longer guarantees a CAPTCHA)
- Correct pagination offset calculation: page 2 now starts at offset 10, and page n (n >= 2) uses the formula 10 + (n-2)*15 instead of the previous broken 20 + (n-2)*50 calculation that caused CAPTCHAs
- Restructure request parameter building to better match a real request
- "kl" cookie is no longer an empty string if the language/region is "all"
- Group related parameter assignments together
- Add header logging to debugging output

Related:
- https://github.com/searxng/searxng/issues/4824
This commit is contained in:
		
							parent
							
								
									98badc9cd0
								
							
						
					
					
						commit
						4fa7de8033
					
				@ -58,7 +58,7 @@ paging = True
 | 
				
			|||||||
time_range_support = True
 | 
					time_range_support = True
 | 
				
			||||||
safesearch = True  # user can't select but the results are filtered
 | 
					safesearch = True  # user can't select but the results are filtered
 | 
				
			||||||
 | 
					
 | 
				
			||||||
url = "https://html.duckduckgo.com/html"
 | 
					url = "https://html.duckduckgo.com/html/"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 | 
					time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 | 
				
			||||||
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 | 
					form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 | 
				
			||||||
@ -248,7 +248,6 @@ def quote_ddg_bangs(query):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def request(query, params):
 | 
					def request(query, params):
 | 
				
			||||||
 | 
					 | 
				
			||||||
    query = quote_ddg_bangs(query)
 | 
					    query = quote_ddg_bangs(query)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if len(query) >= 500:
 | 
					    if len(query) >= 500:
 | 
				
			||||||
@ -256,93 +255,79 @@ def request(query, params):
 | 
				
			|||||||
        params["url"] = None
 | 
					        params["url"] = None
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Advanced search syntax ends in CAPTCHA
 | 
					 | 
				
			||||||
    # https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
 | 
					 | 
				
			||||||
    query = " ".join(
 | 
					 | 
				
			||||||
        [
 | 
					 | 
				
			||||||
            x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
 | 
					 | 
				
			||||||
            for x in query.split()
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 | 
					    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 | 
				
			||||||
    if eng_region == "wt-wt":
 | 
					 | 
				
			||||||
        # https://html.duckduckgo.com/html sets an empty value for "all".
 | 
					 | 
				
			||||||
        eng_region = ""
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    params['data']['kl'] = eng_region
 | 
					    # Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
 | 
				
			||||||
    params['cookies']['kl'] = eng_region
 | 
					    # (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
 | 
				
			||||||
 | 
					    # and breaking changes in the future.
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    # The params['data'] dictionary can have the following key parameters, in this order:
 | 
				
			||||||
 | 
					    # - q (str): Search query string
 | 
				
			||||||
 | 
					    # - b (str): Beginning parameter - empty string for first page requests
 | 
				
			||||||
 | 
					    # - s (int): Search offset for pagination
 | 
				
			||||||
 | 
					    # - nextParams (str): Continuation parameters from previous page response, typically empty
 | 
				
			||||||
 | 
					    # - v (str): Typically 'l' for subsequent pages
 | 
				
			||||||
 | 
					    # - o (str): Output format, typically 'json'
 | 
				
			||||||
 | 
					    # - dc (int): Display count - value equal to offset (s) + 1
 | 
				
			||||||
 | 
					    # - api (str): API endpoint identifier, typically 'd.js'
 | 
				
			||||||
 | 
					    # - vqd (str): Validation query digest
 | 
				
			||||||
 | 
					    # - kl (str): Keyboard language/region code (e.g., 'en-us')
 | 
				
			||||||
 | 
					    # - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    params['url'] = url
 | 
					 | 
				
			||||||
    params['method'] = 'POST'
 | 
					 | 
				
			||||||
    params['data']['q'] = query
 | 
					    params['data']['q'] = query
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # The API is not documented, so we do some reverse engineering and emulate
 | 
					 | 
				
			||||||
    # what https://html.duckduckgo.com/html does when you press "next Page" link
 | 
					 | 
				
			||||||
    # again and again ..
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    params['headers']['Sec-Fetch-Dest'] = "document"
 | 
					 | 
				
			||||||
    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
 | 
					 | 
				
			||||||
    params['headers']['Sec-Fetch-Site'] = "same-origin"
 | 
					 | 
				
			||||||
    params['headers']['Sec-Fetch-User'] = "?1"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # Form of the initial search page does have empty values in the form
 | 
					 | 
				
			||||||
    if params['pageno'] == 1:
 | 
					    if params['pageno'] == 1:
 | 
				
			||||||
 | 
					 | 
				
			||||||
        params['data']['b'] = ""
 | 
					        params['data']['b'] = ""
 | 
				
			||||||
 | 
					    elif params['pageno'] >= 2:
 | 
				
			||||||
    params['data']['df'] = ''
 | 
					        offset = 10 + (params['pageno'] - 2) * 15  # Page 2 = 10, Page 3+ = 10 + n*15
 | 
				
			||||||
    if params['time_range'] in time_range_dict:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        params['data']['df'] = time_range_dict[params['time_range']]
 | 
					 | 
				
			||||||
        params['cookies']['df'] = time_range_dict[params['time_range']]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if params['pageno'] == 2:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # second page does have an offset of 20
 | 
					 | 
				
			||||||
        offset = (params['pageno'] - 1) * 20
 | 
					 | 
				
			||||||
        params['data']['s'] = offset
 | 
					        params['data']['s'] = offset
 | 
				
			||||||
        params['data']['dc'] = offset + 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    elif params['pageno'] > 2:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # third and following pages do have an offset of 20 + n*50
 | 
					 | 
				
			||||||
        offset = 20 + (params['pageno'] - 2) * 50
 | 
					 | 
				
			||||||
        params['data']['s'] = offset
 | 
					 | 
				
			||||||
        params['data']['dc'] = offset + 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if params['pageno'] > 1:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # initial page does not have these additional data in the input form
 | 
					 | 
				
			||||||
        params['data']['o'] = form_data.get('o', 'json')
 | 
					 | 
				
			||||||
        params['data']['api'] = form_data.get('api', 'd.js')
 | 
					 | 
				
			||||||
        params['data']['nextParams'] = form_data.get('nextParams', '')
 | 
					        params['data']['nextParams'] = form_data.get('nextParams', '')
 | 
				
			||||||
        params['data']['v'] = form_data.get('v', 'l')
 | 
					        params['data']['v'] = form_data.get('v', 'l')
 | 
				
			||||||
        params['headers']['Referer'] = url
 | 
					        params['data']['o'] = form_data.get('o', 'json')
 | 
				
			||||||
 | 
					        params['data']['dc'] = offset + 1
 | 
				
			||||||
 | 
					        params['data']['api'] = form_data.get('api', 'd.js')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # vqd is required to request other pages after the first one
 | 
				
			||||||
        vqd = get_vqd(query, eng_region, force_request=False)
 | 
					        vqd = get_vqd(query, eng_region, force_request=False)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Certain conditions must be met in order to call up one of the
 | 
					 | 
				
			||||||
        # following pages ...
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if vqd:
 | 
					        if vqd:
 | 
				
			||||||
            params['data']['vqd'] = vqd  # follow up pages / requests needs a vqd argument
 | 
					            params['data']['vqd'] = vqd
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            # Don't try to call follow up pages without a vqd value.  DDG
 | 
					            # Don't try to call follow up pages without a vqd value.
 | 
				
			||||||
            # recognizes this as a request from a bot.  This lowers the
 | 
					            # DDG recognizes this as a request from a bot. This lowers the
 | 
				
			||||||
            # reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
 | 
					            # reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
 | 
				
			||||||
            params["url"] = None
 | 
					            params["url"] = None
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if params['searxng_locale'].startswith("zh"):
 | 
					        if params['searxng_locale'].startswith("zh"):
 | 
				
			||||||
            # Some locales (at least China) do not have a "next page" button and ddg
 | 
					            # Some locales (at least China) do not have a "next page" button and DDG
 | 
				
			||||||
            # will return a HTTP/2 403 Forbidden for a request of such a page.
 | 
					            # will return a HTTP/2 403 Forbidden for a request of such a page.
 | 
				
			||||||
            params["url"] = None
 | 
					            params["url"] = None
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Put empty kl in form data if language/region set to all
 | 
				
			||||||
 | 
					    if eng_region == "wt-wt":
 | 
				
			||||||
 | 
					        params['data']['kl'] = ""
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        params['data']['kl'] = eng_region
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    params['data']['df'] = ''
 | 
				
			||||||
 | 
					    if params['time_range'] in time_range_dict:
 | 
				
			||||||
 | 
					        params['data']['df'] = time_range_dict[params['time_range']]
 | 
				
			||||||
 | 
					        params['cookies']['df'] = time_range_dict[params['time_range']]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    params['cookies']['kl'] = eng_region
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    params['url'] = url
 | 
				
			||||||
 | 
					    params['method'] = 'POST'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
 | 
				
			||||||
 | 
					    params['headers']['Referer'] = url
 | 
				
			||||||
 | 
					    params['headers']['Sec-Fetch-Dest'] = "document"
 | 
				
			||||||
 | 
					    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
 | 
				
			||||||
 | 
					    params['headers']['Sec-Fetch-Site'] = "same-origin"
 | 
				
			||||||
 | 
					    params['headers']['Sec-Fetch-User'] = "?1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    logger.debug("param headers: %s", params['headers'])
 | 
				
			||||||
    logger.debug("param data: %s", params['data'])
 | 
					    logger.debug("param data: %s", params['data'])
 | 
				
			||||||
    logger.debug("param cookies: %s", params['cookies'])
 | 
					    logger.debug("param cookies: %s", params['cookies'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user