searxng/searx/engines/mullvad_leta.py
Markus Heiser e840e3f960 [fix] engine mullvadleta - ignore HTTP 403 & 429 response
Even if you're using Mullvad's VPN and a proper browser, you'll still get
blocked for specific searches [1] with a 403 or 429 HTTP status code.
Mullvad only blocks the search request itself and doesn't prevent you from
doing further searches.

The logic should handle the blocked requests (403, 429), but not put the engine
on a cooldown.

[1] https://leta.mullvad.net/search?q=site%3Afoo+bar&engine=brave

Closes: https://github.com/searxng/searxng/issues/5328
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-10-18 09:05:54 +02:00


# SPDX-License-Identifier: AGPL-3.0-or-later
"""Mullvad Leta is a search engine proxy. Currently Leta only offers text
search results not image, news or any other types of search result. Leta acts
as a proxy to Google and Brave search results. You can select which backend
search engine you wish to use, see (:py:obj:`leta_engine`).
.. hint::
Leta caches each search for up to 30 days. For example, if you use search
terms like ``news``, contrary to your intention you'll get very old results!
Configuration
=============
The engine has the following additional settings:
- :py:obj:`leta_engine` (:py:obj:`LetaEnginesType`)
You can configure one Leta engine for Google and one for Brave:
.. code:: yaml
- name: mullvadleta
engine: mullvad_leta
leta_engine: google
shortcut: ml
- name: mullvadleta brave
engine: mullvad_leta
network: mullvadleta # use network from engine "mullvadleta" configured above
leta_engine: brave
shortcut: mlb
Implementations
===============
"""

import typing as t
from urllib.parse import urlencode

import babel
from httpx import Response
from lxml import html

from searx.enginelib.traits import EngineTraits
from searx.extended_types import SXNG_Response
from searx.locales import get_official_locales, language_tag, region_tag
from searx.utils import eval_xpath_list
from searx.result_types import EngineResults, MainResult
from searx.network import raise_for_httperror

search_url = "https://leta.mullvad.net"

# about
about = {
    "website": search_url,
    "wikidata_id": 'Q47008412',  # the Mullvad id - not leta, but related
    "official_api_documentation": 'https://leta.mullvad.net/faq',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ["general", "web"]
paging = True
max_page = 10
time_range_support = True
time_range_dict = {
    "day": "d",
    "week": "w",
    "month": "m",
    "year": "y",
}

LetaEnginesType = t.Literal["google", "brave"]
"""Engine types supported by mullvadleta."""

leta_engine: LetaEnginesType = "google"
"""Select Leta's engine type from :py:obj:`LetaEnginesType`."""

traits: EngineTraits  # assigned to the engine module by searxng's engine loader


def init(_):
    l = t.get_args(LetaEnginesType)
    if leta_engine not in l:
        raise ValueError(f"leta_engine '{leta_engine}' is invalid, use one of {', '.join(l)}")


class DataNodeQueryMetaDataIndices(t.TypedDict):
    """Indices into query metadata."""

    success: int
    q: int  # pylint: disable=invalid-name
    country: int
    language: int
    lastUpdated: int
    engine: int
    items: int
    infobox: int
    news: int
    timestamp: int
    altered: int
    page: int
    next: int  # if -1, no more results are available
    previous: int


class DataNodeResultIndices(t.TypedDict):
    """Indices into the data of a single search result."""

    link: int
    snippet: int
    title: int
    favicon: int
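
# A hypothetical, abbreviated sketch of the flattened "data" list served by
# Leta's __data.json endpoint -- all indices below are illustrative, not
# actual values:
#
#   data_nodes = [
#       {"q": 1, "items": 2, ...},                   # DataNodeQueryMetaDataIndices
#       "example query",                             # data_nodes[1] -> query string
#       [3, ...],                                    # data_nodes[2] -> one index per result
#       {"link": 4, "title": 5, "snippet": 6, ...},  # DataNodeResultIndices
#       "https://example.org/",                      # data_nodes[4] -> result URL
#       "Example title",                             # data_nodes[5]
#       "Example snippet ...",                       # data_nodes[6]
#   ]
#
# response() below resolves these indices instead of walking a nested JSON
# structure.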


def request(query: str, params: dict[str, t.Any]) -> None:
    params["raise_for_httperror"] = False
    params["method"] = "GET"
    args = {
        "q": query,
        "engine": leta_engine,
        "x-sveltekit-invalidated": "001",  # hardcoded from all requests seen
    }
    country = traits.get_region(params.get("searxng_locale"), traits.all_locale)  # type: ignore
    if country:
        args["country"] = country
    language = traits.get_language(params.get("searxng_locale"), traits.all_locale)  # type: ignore
    if language:
        args["language"] = language
    if params["time_range"] in time_range_dict:
        args["lastUpdated"] = time_range_dict[params["time_range"]]
    if params["pageno"] > 1:
        args["page"] = params["pageno"]
    params["url"] = f"{search_url}/search/__data.json?{urlencode(args)}"


def response(resp: SXNG_Response) -> EngineResults:
    results = EngineResults()

    if resp.status_code in (403, 429):
        # Even when using Mullvad's VPN and a proper browser, specific
        # searches get blocked with a 403 or 429 HTTP status code.  Mullvad
        # blocks only that single search request and doesn't prevent further
        # searches, so return empty results instead of raising (which would
        # put the engine on a cooldown).
        # https://github.com/searxng/searxng/issues/5328#issue-3518337233
        return results

    # raise for other errors
    raise_for_httperror(resp)

    json_response = resp.json()
    nodes = json_response["nodes"]
    # nodes[0]: is None
    # nodes[1]: has "connected=True", not useful
    # nodes[2]: query results within "data"
    data_nodes = nodes[2]["data"]

    # Instead of a nested object structure, all objects are flattened into a
    # single list.  The first object in data_nodes provides indices into that
    # list to access each search result, which is itself an object of further
    # indices.
    #
    # Read the related TypedDict definitions above for details.
    query_meta_data: DataNodeQueryMetaDataIndices = data_nodes[0]
    query_items_indices = query_meta_data["items"]
    for idx in data_nodes[query_items_indices]:
        query_item_indices: DataNodeResultIndices = data_nodes[idx]
        results.add(
            MainResult(
                url=data_nodes[query_item_indices["link"]],
                title=data_nodes[query_item_indices["title"]],
                content=data_nodes[query_item_indices["snippet"]],
            )
        )
    return results


def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fetch languages and regions from Mullvad-Leta."""

    def extract_table_data(table):
        for row in table.xpath(".//tr")[2:]:
            cells = row.xpath(".//td | .//th")  # includes headers and data
            if len(cells) > 1:  # ensure the column exists
                cell0 = cells[0].text_content().strip()
                cell1 = cells[1].text_content().strip()
                yield [cell0, cell1]

    # pylint: disable=import-outside-toplevel
    # see https://github.com/searxng/searxng/issues/762
    from searx.network import get as http_get

    # pylint: enable=import-outside-toplevel

    resp = http_get(f"{search_url}/documentation")
    if not isinstance(resp, Response):
        print("ERROR: failed to get response from mullvad-leta. Are you connected to the VPN?")
        return
    if not resp.ok:
        print("ERROR: response from mullvad-leta is not OK. Are you connected to the VPN?")
        return

    dom = html.fromstring(resp.text)

    # There are 4 HTML tables on the documentation page for extracting information:
    # 0. Keyboard Shortcuts
    # 1. Query Parameters (shoutout to Mullvad for accessible docs for integration)
    # 2. Country Codes [Country, Code]
    # 3. Language Codes [Language, Code]
    tables = eval_xpath_list(dom.body, "//table")
    if tables is None or len(tables) <= 0:
        print("ERROR: could not find any tables. Was the page updated?")
        return

    language_table = tables[3]
    lang_map = {
        "zh-hant": "zh_Hant",
        "zh-hans": "zh_Hans",
        "jp": "ja",
    }

    for language, code in extract_table_data(language_table):
        locale_tag = lang_map.get(code, code).replace("-", "_")  # type: ignore
        try:
            locale = babel.Locale.parse(locale_tag)
        except babel.UnknownLocaleError:
            print(f"ERROR: Mullvad-Leta language {language} ({code}) is unknown by babel")
            continue
        sxng_tag = language_tag(locale)
        engine_traits.languages[sxng_tag] = code

    country_table = tables[2]
    country_map = {
        "cn": "zh-CN",
        "hk": "zh-HK",
        "jp": "ja-JP",
        "my": "ms-MY",
        "tw": "zh-TW",
        "uk": "en-GB",
        "us": "en-US",
    }

    for country, code in extract_table_data(country_table):
        sxng_tag = country_map.get(code)
        if sxng_tag:
            engine_traits.regions[sxng_tag] = code
            continue

        # try a simple "ll_CC" locale built from the country code
        try:
            locale = babel.Locale.parse(f"{code.lower()}_{code.upper()}")
        except babel.UnknownLocaleError:
            locale = None
        if locale:
            engine_traits.regions[region_tag(locale)] = code
            continue

        # fall back to the official locales of the country
        official_locales = get_official_locales(code, engine_traits.languages.keys(), regional=True)
        if not official_locales:
            print(f"ERROR: Mullvad-Leta country '{code}' ({country}) could not be mapped as expected.")
            continue

        for locale in official_locales:
            engine_traits.regions[region_tag(locale)] = code
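
# NOTE: fetch_traits() is not called while serving search requests; in
# SearXNG it is typically run by the traits update tooling (for example the
# searxng_extra/update/update_engine_traits.py script -- path given as an
# assumption about the repository layout) to regenerate this engine's
# language and region data.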