Mirror of https://github.com/searxng/searxng.git
[refactor] duckduckgo engine: improve request logic and code structure (#4837)
Changes:
- Add trailing slash to base URL to prevent potential redirects
- Remove advanced search syntax filtering (no longer guarantees a CAPTCHA)
- Correct pagination offset calculation: Page 2 now starts at offset 10, and
  subsequent pages use the 10 + (n-2)*15 formula instead of the previous broken
  20 + (n-2)*50 calculation that caused CAPTCHAs (see the sketch below)
- Restructure request parameter building to better match a real request
- "kl" cookie is no longer an empty string if the language/region is "all"
- Group related parameter assignments together
- Add header logging to debugging output

Related:
- https://github.com/searxng/searxng/issues/4824
parent 98badc9cd0
commit 4fa7de8033
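As a quick illustration of the pagination fix named in the commit message, here is a small standalone Python sketch. It is not part of the patch; the helper names are made up for illustration, but the two formulas are the ones from the old and the new code:

def old_offset(pageno):
    # previous calculation: page 2 -> 20, page 3 -> 70, page 4 -> 120, ...
    if pageno == 2:
        return (pageno - 1) * 20
    return 20 + (pageno - 2) * 50


def new_offset(pageno):
    # corrected calculation: page 2 -> 10, page 3 -> 25, page 4 -> 40, ...
    return 10 + (pageno - 2) * 15


for page in range(2, 5):
    print(page, old_offset(page), new_offset(page))

For pages 2 to 4 this yields 10, 25 and 40 instead of 20, 70 and 120.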
@@ -58,7 +58,7 @@ paging = True
 time_range_support = True
 safesearch = True  # user can't select but the results are filtered
 
-url = "https://html.duckduckgo.com/html"
+url = "https://html.duckduckgo.com/html/"
 
 time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
@@ -248,7 +248,6 @@ def quote_ddg_bangs(query):
 
 
 def request(query, params):
-
     query = quote_ddg_bangs(query)
 
     if len(query) >= 500:
@@ -256,93 +255,79 @@ def request(query, params):
         params["url"] = None
         return
 
-    # Advanced search syntax ends in CAPTCHA
-    # https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
-    query = " ".join(
-        [
-            x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
-            for x in query.split()
-        ]
-    )
     eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
-    if eng_region == "wt-wt":
-        # https://html.duckduckgo.com/html sets an empty value for "all".
-        eng_region = ""
 
-    params['data']['kl'] = eng_region
-    params['cookies']['kl'] = eng_region
+    # Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
+    # (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
+    # and breaking changes in the future.
+    #
+    # The params['data'] dictionary can have the following key parameters, in this order:
+    # - q (str): Search query string
+    # - b (str): Beginning parameter - empty string for first page requests
+    # - s (int): Search offset for pagination
+    # - nextParams (str): Continuation parameters from previous page response, typically empty
+    # - v (str): Typically 'l' for subsequent pages
+    # - o (str): Output format, typically 'json'
+    # - dc (int): Display count - value equal to offset (s) + 1
+    # - api (str): API endpoint identifier, typically 'd.js'
+    # - vqd (str): Validation query digest
+    # - kl (str): Keyboard language/region code (e.g., 'en-us')
+    # - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
 
-    # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
-
-    params['url'] = url
-    params['method'] = 'POST'
     params['data']['q'] = query
 
-    # The API is not documented, so we do some reverse engineering and emulate
-    # what https://html.duckduckgo.com/html does when you press "next Page" link
-    # again and again ..
-
-    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
-
-    params['headers']['Sec-Fetch-Dest'] = "document"
-    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
-    params['headers']['Sec-Fetch-Site'] = "same-origin"
-    params['headers']['Sec-Fetch-User'] = "?1"
-
-    # Form of the initial search page does have empty values in the form
     if params['pageno'] == 1:
-
         params['data']['b'] = ""
-
-    params['data']['df'] = ''
-    if params['time_range'] in time_range_dict:
-
-        params['data']['df'] = time_range_dict[params['time_range']]
-        params['cookies']['df'] = time_range_dict[params['time_range']]
-
-    if params['pageno'] == 2:
-
-        # second page does have an offset of 20
-        offset = (params['pageno'] - 1) * 20
+    elif params['pageno'] >= 2:
+        offset = 10 + (params['pageno'] - 2) * 15  # Page 2 = 10, Page 3+ = 10 + n*15
         params['data']['s'] = offset
-        params['data']['dc'] = offset + 1
-
-    elif params['pageno'] > 2:
-
-        # third and following pages do have an offset of 20 + n*50
-        offset = 20 + (params['pageno'] - 2) * 50
-        params['data']['s'] = offset
-        params['data']['dc'] = offset + 1
-
-    if params['pageno'] > 1:
-
-        # initial page does not have these additional data in the input form
-        params['data']['o'] = form_data.get('o', 'json')
-        params['data']['api'] = form_data.get('api', 'd.js')
         params['data']['nextParams'] = form_data.get('nextParams', '')
         params['data']['v'] = form_data.get('v', 'l')
-        params['headers']['Referer'] = url
-
+        params['data']['o'] = form_data.get('o', 'json')
+        params['data']['dc'] = offset + 1
+        params['data']['api'] = form_data.get('api', 'd.js')
+
+        # vqd is required to request other pages after the first one
         vqd = get_vqd(query, eng_region, force_request=False)
-
-        # Certain conditions must be met in order to call up one of the
-        # following pages ...
-
         if vqd:
-            params['data']['vqd'] = vqd  # follow up pages / requests needs a vqd argument
+            params['data']['vqd'] = vqd
         else:
-            # Don't try to call follow up pages without a vqd value. DDG
-            # recognizes this as a request from a bot. This lowers the
+            # Don't try to call follow up pages without a vqd value.
+            # DDG recognizes this as a request from a bot. This lowers the
             # reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
             params["url"] = None
             return
 
         if params['searxng_locale'].startswith("zh"):
-            # Some locales (at least China) do not have a "next page" button and ddg
+            # Some locales (at least China) do not have a "next page" button and DDG
            # will return a HTTP/2 403 Forbidden for a request of such a page.
             params["url"] = None
             return
 
+    # Put empty kl in form data if language/region set to all
+    if eng_region == "wt-wt":
+        params['data']['kl'] = ""
+    else:
+        params['data']['kl'] = eng_region
+
+    params['data']['df'] = ''
+    if params['time_range'] in time_range_dict:
+        params['data']['df'] = time_range_dict[params['time_range']]
+        params['cookies']['df'] = time_range_dict[params['time_range']]
+
+    params['cookies']['kl'] = eng_region
+
+    params['url'] = url
+    params['method'] = 'POST'
+
+    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
+    params['headers']['Referer'] = url
+    params['headers']['Sec-Fetch-Dest'] = "document"
+    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
+    params['headers']['Sec-Fetch-Site'] = "same-origin"
+    params['headers']['Sec-Fetch-User'] = "?1"
+
+    logger.debug("param headers: %s", params['headers'])
     logger.debug("param data: %s", params['data'])
     logger.debug("param cookies: %s", params['cookies'])
 
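For reference, a rough standalone approximation of the form data that the patched request() ends up building for a follow-up page. This is not the engine module itself: build_page_data() is a hypothetical helper for illustration, and 'PLACEHOLDER-VQD' stands in for the value that get_vqd() would normally return.

form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}


def build_page_data(query, pageno, eng_region, time_range=None):
    # Hypothetical helper: mirrors only the data-building part of the patched request().
    data = {'q': query}
    if pageno == 1:
        data['b'] = ""
    elif pageno >= 2:
        offset = 10 + (pageno - 2) * 15  # page 2 = 10, page 3 = 25, ...
        data['s'] = offset
        data['nextParams'] = form_data.get('nextParams', '')
        data['v'] = form_data.get('v', 'l')
        data['o'] = form_data.get('o', 'json')
        data['dc'] = offset + 1
        data['api'] = form_data.get('api', 'd.js')
        data['vqd'] = 'PLACEHOLDER-VQD'  # normally obtained via get_vqd()
    # "kl" is sent as an empty string only when the region is "all" (wt-wt)
    data['kl'] = "" if eng_region == "wt-wt" else eng_region
    data['df'] = time_range_dict.get(time_range, '')
    return data


print(build_page_data("searxng", pageno=3, eng_region="en-us", time_range="week"))
# {'q': 'searxng', 's': 25, 'nextParams': '', 'v': 'l', 'o': 'json', 'dc': 26,
#  'api': 'd.js', 'vqd': 'PLACEHOLDER-VQD', 'kl': 'en-us', 'df': 'w'}

The key order follows the ordering documented in the comment block that this commit adds to request().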