mirror of
https://github.com/benbusby/whoogle-search.git
synced 2025-10-18 04:21:26 -04:00
This commit introduces support for the Mullvad Leta search backend, which is now enabled by default. It includes a new conversion function to transform Leta's search results into a format compatible with Whoogle. The README has been updated to reflect this change, detailing the limitations of the Leta backend and how to switch back to Google if needed. Additionally, the configuration model has been updated to include a setting for enabling/disabling the Leta backend.
461 lines
17 KiB
Python
461 lines
17 KiB
Python
from app.models.config import Config
|
|
from app.utils.misc import read_config_bool
|
|
from app.services.provider import get_http_client
|
|
from datetime import datetime
|
|
from defusedxml import ElementTree as ET
|
|
import random
|
|
import httpx
|
|
import urllib.parse as urlparse
|
|
import os
|
|
from stem import Signal, SocketError
|
|
from stem.connection import AuthenticationFailure
|
|
from stem.control import Controller
|
|
from stem.connection import authenticate_cookie, authenticate_password
|
|
|
|
MAPS_URL = 'https://maps.google.com/maps'
|
|
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/'
|
|
'complete/search?client=toolbar&')
|
|
|
|
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
|
|
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
|
|
|
|
# Valid query params
|
|
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']
|
|
|
|
|
|
class TorError(Exception):
|
|
"""Exception raised for errors in Tor requests.
|
|
|
|
Attributes:
|
|
message: a message describing the error that occurred
|
|
disable: optionally disables Tor in the user config (note:
|
|
this should only happen if the connection has been dropped
|
|
altogether).
|
|
"""
|
|
|
|
def __init__(self, message, disable=False) -> None:
|
|
self.message = message
|
|
self.disable = disable
|
|
super().__init__(message)
|
|
|
|
|
|
def send_tor_signal(signal: Signal) -> bool:
|
|
use_pass = read_config_bool('WHOOGLE_TOR_USE_PASS')
|
|
|
|
confloc = './misc/tor/control.conf'
|
|
# Check that the custom location of conf is real.
|
|
temp = os.getenv('WHOOGLE_TOR_CONF', '')
|
|
if os.path.isfile(temp):
|
|
confloc = temp
|
|
|
|
# Attempt to authenticate and send signal.
|
|
try:
|
|
with Controller.from_port(port=9051) as c:
|
|
if use_pass:
|
|
with open(confloc, "r") as conf:
|
|
# Scan for the last line of the file.
|
|
for line in conf:
|
|
pass
|
|
secret = line.strip('\n')
|
|
authenticate_password(c, password=secret)
|
|
else:
|
|
cookie_path = '/var/lib/tor/control_auth_cookie'
|
|
authenticate_cookie(c, cookie_path=cookie_path)
|
|
c.signal(signal)
|
|
os.environ['TOR_AVAILABLE'] = '1'
|
|
return True
|
|
except (SocketError, AuthenticationFailure,
|
|
ConnectionRefusedError, ConnectionError):
|
|
# TODO: Handle Tor authentication (password and cookie)
|
|
os.environ['TOR_AVAILABLE'] = '0'
|
|
|
|
return False
|
|
|
|
|
|
def gen_user_agent(config, is_mobile) -> str:
|
|
# Define the default PlayStation Portable user agent (replaces Lynx)
|
|
DEFAULT_UA = 'Mozilla/4.0 (PSP (PlayStation Portable); 2.00)'
|
|
|
|
# If using custom user agent, return the custom string
|
|
if config.user_agent == 'custom' and config.custom_user_agent:
|
|
return config.custom_user_agent
|
|
|
|
# If using environment configuration
|
|
if config.user_agent == 'env_conf':
|
|
if is_mobile:
|
|
env_ua = os.getenv('WHOOGLE_USER_AGENT_MOBILE', '')
|
|
if env_ua:
|
|
return env_ua
|
|
else:
|
|
env_ua = os.getenv('WHOOGLE_USER_AGENT', '')
|
|
if env_ua:
|
|
return env_ua
|
|
# If env vars are not set, fall back to default
|
|
return DEFAULT_UA
|
|
|
|
# If using default user agent
|
|
if config.user_agent == 'default':
|
|
return DEFAULT_UA
|
|
|
|
# If no custom user agent is set, generate a random one (for backwards compatibility)
|
|
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
|
|
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
|
|
|
|
if is_mobile:
|
|
return MOBILE_UA.format("Mozilla", firefox)
|
|
|
|
return DESKTOP_UA.format("Mozilla", linux, firefox)
|
|
|
|
|
|
def gen_query_leta(query, args, config) -> str:
|
|
"""Builds a query string for Mullvad Leta backend
|
|
|
|
Args:
|
|
query: The search query string
|
|
args: Request arguments
|
|
config: User configuration
|
|
|
|
Returns:
|
|
str: A formatted query string for Leta
|
|
"""
|
|
# Ensure search query is parsable
|
|
query = urlparse.quote(query)
|
|
|
|
# Build query starting with 'q='
|
|
query_str = 'q=' + query
|
|
|
|
# Always use Google as the engine (Leta supports 'google' or 'brave')
|
|
query_str += '&engine=google'
|
|
|
|
# Add country if configured
|
|
if config.country:
|
|
query_str += '&country=' + config.country.lower()
|
|
|
|
# Add language if configured
|
|
# Convert from Google's lang format (lang_en) to Leta's format (en)
|
|
if config.lang_search:
|
|
lang_code = config.lang_search.replace('lang_', '')
|
|
query_str += '&language=' + lang_code
|
|
|
|
# Handle time period filtering with :past syntax or tbs parameter
|
|
if ':past' in query:
|
|
time_range = str.strip(query.split(':past', 1)[-1]).lower()
|
|
if time_range.startswith('day'):
|
|
query_str += '&lastUpdated=d'
|
|
elif time_range.startswith('week'):
|
|
query_str += '&lastUpdated=w'
|
|
elif time_range.startswith('month'):
|
|
query_str += '&lastUpdated=m'
|
|
elif time_range.startswith('year'):
|
|
query_str += '&lastUpdated=y'
|
|
elif 'tbs' in args or 'tbs' in config:
|
|
result_tbs = args.get('tbs') if 'tbs' in args else config.tbs
|
|
# Convert Google's tbs format to Leta's lastUpdated format
|
|
if result_tbs and 'qdr:d' in result_tbs:
|
|
query_str += '&lastUpdated=d'
|
|
elif result_tbs and 'qdr:w' in result_tbs:
|
|
query_str += '&lastUpdated=w'
|
|
elif result_tbs and 'qdr:m' in result_tbs:
|
|
query_str += '&lastUpdated=m'
|
|
elif result_tbs and 'qdr:y' in result_tbs:
|
|
query_str += '&lastUpdated=y'
|
|
|
|
# Add pagination if present
|
|
if 'start' in args:
|
|
start = int(args.get('start', '0'))
|
|
# Leta uses 1-indexed pages, Google uses result offset
|
|
page = (start // 10) + 1
|
|
if page > 1:
|
|
query_str += '&page=' + str(page)
|
|
|
|
return query_str
|
|
|
|
|
|
def gen_query(query, args, config) -> str:
|
|
# If using Leta backend, build query differently
|
|
if config.use_leta:
|
|
return gen_query_leta(query, args, config)
|
|
|
|
param_dict = {key: '' for key in VALID_PARAMS}
|
|
|
|
# Use :past(hour/day/week/month/year) if available
|
|
# example search "new restaurants :past month"
|
|
lang = ''
|
|
if ':past' in query and 'tbs' not in args:
|
|
time_range = str.strip(query.split(':past', 1)[-1])
|
|
param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
|
|
elif 'tbs' in args or 'tbs' in config:
|
|
result_tbs = args.get('tbs') if 'tbs' in args else config['tbs']
|
|
param_dict['tbs'] = '&tbs=' + result_tbs
|
|
|
|
# Occasionally the 'tbs' param provided by google also contains a
|
|
# field for 'lr', but formatted strangely. This is a rough solution
|
|
# for this.
|
|
#
|
|
# Example:
|
|
# &tbs=qdr:h,lr:lang_1pl
|
|
# -- the lr param needs to be extracted and remove the leading '1'
|
|
result_params = [_ for _ in result_tbs.split(',') if 'lr:' in _]
|
|
if len(result_params) > 0:
|
|
result_param = result_params[0]
|
|
lang = result_param[result_param.find('lr:') + 3:len(result_param)]
|
|
|
|
# Ensure search query is parsable
|
|
query = urlparse.quote(query)
|
|
|
|
# Pass along type of results (news, images, books, etc)
|
|
if 'tbm' in args:
|
|
param_dict['tbm'] = '&tbm=' + args.get('tbm')
|
|
|
|
# Get results page start value (10 per page, ie page 2 start val = 20)
|
|
if 'start' in args:
|
|
param_dict['start'] = '&start=' + args.get('start')
|
|
|
|
# Search for results near a particular city, if available
|
|
if config.near:
|
|
param_dict['near'] = '&near=' + urlparse.quote(config.near)
|
|
|
|
# Set language for results (lr) if source isn't set, otherwise use the
|
|
# result language param provided in the results
|
|
if 'source' in args:
|
|
param_dict['source'] = '&source=' + args.get('source')
|
|
param_dict['lr'] = ('&lr=' + ''.join(
|
|
[_ for _ in lang if not _.isdigit()]
|
|
)) if lang else ''
|
|
else:
|
|
param_dict['lr'] = (
|
|
'&lr=' + config.lang_search
|
|
) if config.lang_search else ''
|
|
|
|
# 'nfpr' defines the exclusion of results from an auto-corrected query
|
|
if 'nfpr' in args:
|
|
param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')
|
|
|
|
# 'chips' is used in image tabs to pass the optional 'filter' to add to the
|
|
# given search term
|
|
if 'chips' in args:
|
|
param_dict['chips'] = '&chips=' + args.get('chips')
|
|
|
|
param_dict['gl'] = (
|
|
'&gl=' + config.country
|
|
) if config.country else ''
|
|
param_dict['hl'] = (
|
|
'&hl=' + config.lang_interface.replace('lang_', '')
|
|
) if config.lang_interface else ''
|
|
param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')
|
|
|
|
# Block all sites specified in the user config
|
|
unquoted_query = urlparse.unquote(query)
|
|
for blocked_site in config.block.replace(' ', '').split(','):
|
|
if not blocked_site:
|
|
continue
|
|
block = (' -site:' + blocked_site)
|
|
query += block if block not in unquoted_query else ''
|
|
|
|
for val in param_dict.values():
|
|
if not val:
|
|
continue
|
|
query += val
|
|
|
|
return query
|
|
|
|
|
|
class Request:
|
|
"""Class used for handling all outbound requests, including search queries,
|
|
search suggestions, and loading of external content (images, audio, etc).
|
|
|
|
Attributes:
|
|
normal_ua: the user's current user agent
|
|
root_path: the root path of the whoogle instance
|
|
config: the user's current whoogle configuration
|
|
"""
|
|
|
|
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
|
|
# Use Leta backend if configured, otherwise use Google
|
|
if config.use_leta:
|
|
self.search_url = 'https://leta.mullvad.net/search?'
|
|
self.use_leta = True
|
|
else:
|
|
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
|
|
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&'
|
|
self.use_leta = False
|
|
|
|
# Optionally send heartbeat to Tor to determine availability
|
|
# Only when Tor is enabled in config to avoid unnecessary socket usage
|
|
if config.tor:
|
|
send_tor_signal(Signal.HEARTBEAT)
|
|
|
|
self.language = config.lang_search if config.lang_search else ''
|
|
self.country = config.country if config.country else ''
|
|
|
|
# For setting Accept-language Header
|
|
self.lang_interface = ''
|
|
if config.accept_language:
|
|
self.lang_interface = config.lang_interface
|
|
|
|
self.mobile = bool(normal_ua) and ('Android' in normal_ua
|
|
or 'iPhone' in normal_ua)
|
|
|
|
# Generate user agent based on config
|
|
self.modified_user_agent = gen_user_agent(config, self.mobile)
|
|
if not self.mobile:
|
|
self.modified_user_agent_mobile = gen_user_agent(config, True)
|
|
|
|
# Set up proxy configuration
|
|
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
|
|
if proxy_path:
|
|
proxy_type = os.environ.get('WHOOGLE_PROXY_TYPE', '')
|
|
proxy_user = os.environ.get('WHOOGLE_PROXY_USER', '')
|
|
proxy_pass = os.environ.get('WHOOGLE_PROXY_PASS', '')
|
|
auth_str = ''
|
|
if proxy_user:
|
|
auth_str = f'{proxy_user}:{proxy_pass}@'
|
|
|
|
proxy_str = f'{proxy_type}://{auth_str}{proxy_path}'
|
|
self.proxies = {
|
|
'https': proxy_str,
|
|
'http': proxy_str
|
|
}
|
|
else:
|
|
self.proxies = {
|
|
'http': 'socks5://127.0.0.1:9050',
|
|
'https': 'socks5://127.0.0.1:9050'
|
|
} if config.tor else {}
|
|
|
|
self.tor = config.tor
|
|
self.tor_valid = False
|
|
self.root_path = root_path
|
|
# Initialize HTTP client (shared per proxies)
|
|
self.http_client = http_client or get_http_client(self.proxies)
|
|
|
|
def __getitem__(self, name):
|
|
return getattr(self, name)
|
|
|
|
def autocomplete(self, query) -> list:
|
|
"""Sends a query to Google's search suggestion service
|
|
|
|
Args:
|
|
query: The in-progress query to send
|
|
|
|
Returns:
|
|
list: The list of matches for possible search suggestions
|
|
|
|
"""
|
|
# Check if autocomplete is disabled via environment variable
|
|
if os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') == '0':
|
|
return []
|
|
|
|
try:
|
|
ac_query = dict(q=query)
|
|
if self.language:
|
|
ac_query['lr'] = self.language
|
|
if self.country:
|
|
ac_query['gl'] = self.country
|
|
if self.lang_interface:
|
|
ac_query['hl'] = self.lang_interface
|
|
|
|
response = self.send(base_url=AUTOCOMPLETE_URL,
|
|
query=urlparse.urlencode(ac_query)).text
|
|
|
|
if not response:
|
|
return []
|
|
|
|
try:
|
|
root = ET.fromstring(response)
|
|
return [_.attrib['data'] for _ in
|
|
root.findall('.//suggestion/[@data]')]
|
|
except ET.ParseError:
|
|
# Malformed XML response
|
|
return []
|
|
except Exception as e:
|
|
# Log the error but don't crash - autocomplete is non-essential
|
|
print(f"Autocomplete error: {str(e)}")
|
|
return []
|
|
|
|
def send(self, base_url='', query='', attempt=0,
|
|
force_mobile=False, user_agent=''):
|
|
"""Sends an outbound request to a URL. Optionally sends the request
|
|
using Tor, if enabled by the user.
|
|
|
|
Args:
|
|
base_url: The URL to use in the request
|
|
query: The optional query string for the request
|
|
attempt: The number of attempts made for the request
|
|
(used for cycling through Tor identities, if enabled)
|
|
force_mobile: Optional flag to enable a mobile user agent
|
|
(used for fetching full size images in search results)
|
|
|
|
Returns:
|
|
Response: The Response object returned by the requests call
|
|
|
|
"""
|
|
use_client_user_agent = int(os.environ.get('WHOOGLE_USE_CLIENT_USER_AGENT', '0'))
|
|
if user_agent and use_client_user_agent == 1:
|
|
modified_user_agent = user_agent
|
|
else:
|
|
if force_mobile and not self.mobile:
|
|
modified_user_agent = self.modified_user_agent_mobile
|
|
else:
|
|
modified_user_agent = self.modified_user_agent
|
|
|
|
headers = {
|
|
'User-Agent': modified_user_agent
|
|
}
|
|
|
|
# Adding the Accept-Language to the Header if possible
|
|
if self.lang_interface:
|
|
headers.update({'Accept-Language':
|
|
self.lang_interface.replace('lang_', '')
|
|
+ ';q=1.0'})
|
|
|
|
# view is suppressed correctly
|
|
now = datetime.now()
|
|
consent_cookie = 'CONSENT=PENDING+987; SOCS=CAESHAgBEhIaAB'
|
|
# Prefer header-based cookies to avoid httpx per-request cookies deprecation
|
|
if 'Cookie' in headers:
|
|
headers['Cookie'] += '; ' + consent_cookie
|
|
else:
|
|
headers['Cookie'] = consent_cookie
|
|
|
|
# Validate Tor conn and request new identity if the last one failed
|
|
if self.tor and not send_tor_signal(
|
|
Signal.NEWNYM if attempt > 0 else Signal.HEARTBEAT):
|
|
raise TorError(
|
|
"Tor was previously enabled, but the connection has been "
|
|
"dropped. Please check your Tor configuration and try again.",
|
|
disable=True)
|
|
|
|
# Make sure that the tor connection is valid, if enabled
|
|
if self.tor:
|
|
try:
|
|
tor_check = self.http_client.get('https://check.torproject.org/',
|
|
headers=headers,
|
|
retries=1)
|
|
self.tor_valid = 'Congratulations' in tor_check.text
|
|
|
|
if not self.tor_valid:
|
|
raise TorError(
|
|
"Tor connection succeeded, but the connection could "
|
|
"not be validated by torproject.org",
|
|
disable=True)
|
|
except httpx.RequestError:
|
|
raise TorError(
|
|
"Error raised during Tor connection validation",
|
|
disable=True)
|
|
|
|
try:
|
|
response = self.http_client.get(
|
|
(base_url or self.search_url) + query,
|
|
headers=headers)
|
|
except httpx.HTTPError as e:
|
|
raise
|
|
|
|
# Retry query with new identity if using Tor (max 10 attempts)
|
|
if 'form id="captcha-form"' in response.text and self.tor:
|
|
attempt += 1
|
|
if attempt > 10:
|
|
raise TorError("Tor query failed -- max attempts exceeded 10")
|
|
return self.send((base_url or self.search_url), query, attempt)
|
|
|
|
return response
|