from app.models.config import Config
from app.utils.misc import read_config_bool
from app.services.provider import get_http_client
from app.utils.ua_generator import (
    load_ua_pool, get_random_ua, DEFAULT_FALLBACK_UA)
from defusedxml import ElementTree as ET
import httpx
import urllib.parse as urlparse
import os
from stem import Signal, SocketError
from stem.connection import AuthenticationFailure
from stem.control import Controller
from stem.connection import authenticate_cookie, authenticate_password

MAPS_URL = 'https://maps.google.com/maps'
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/'
                    'complete/search?client=toolbar&')

# Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']


class TorError(Exception):
    """Exception raised for errors in Tor requests.

    Attributes:
        message: a message describing the error that occurred
        disable: optionally disables Tor in the user config (note:
            this should only happen if the connection has been dropped
            altogether).
    """

    def __init__(self, message, disable=False) -> None:
        self.message = message
        self.disable = disable
        super().__init__(message)


def send_tor_signal(signal: Signal) -> bool:
    """Authenticates with the local Tor control port and sends a signal.

    Authentication uses the password stored on the last line of the Tor
    control config file when WHOOGLE_TOR_USE_PASS is enabled, and the
    control auth cookie otherwise. The outcome is also recorded in the
    TOR_AVAILABLE environment variable ('1' on success, '0' on failure).

    Args:
        signal: The stem Signal to send to the Tor controller

    Returns:
        bool: True if the signal was sent, False if the connection or
            authentication failed
    """
    use_pass = read_config_bool('WHOOGLE_TOR_USE_PASS')

    confloc = './misc/tor/control.conf'
    # Check that the custom location of conf is real.
    temp = os.getenv('WHOOGLE_TOR_CONF', '')
    if os.path.isfile(temp):
        confloc = temp

    # Attempt to authenticate and send signal.
    try:
        with Controller.from_port(port=9051) as c:
            if use_pass:
                # Start from an empty value so that an empty conf file
                # results in a failed authentication rather than an
                # unbound variable error.
                line = ''
                with open(confloc, "r") as conf:
                    # Scan for the last line of the file.
                    for line in conf:
                        pass
                secret = line.strip('\n')
                authenticate_password(c, password=secret)
            else:
                cookie_path = '/var/lib/tor/control_auth_cookie'
                authenticate_cookie(c, cookie_path=cookie_path)
            c.signal(signal)
            os.environ['TOR_AVAILABLE'] = '1'
            return True
    except (SocketError, AuthenticationFailure,
            ConnectionRefusedError, ConnectionError):
        # TODO: Handle Tor authentication (password and cookie)
        os.environ['TOR_AVAILABLE'] = '0'

    return False


def gen_user_agent(config, is_mobile) -> str:
    """Selects the user agent string to use for outbound requests.

    Args:
        config: The user's configuration; `user_agent` selects the mode
            ('custom', 'env_conf' or 'default') and `custom_user_agent`
            holds the custom string, if any
        is_mobile: Whether a mobile user agent is wanted (only affects
            the 'env_conf' mode)

    Returns:
        str: The user agent string
    """
    # If using custom user agent, return the custom string
    if config.user_agent == 'custom' and config.custom_user_agent:
        return config.custom_user_agent

    # If using environment configuration
    if config.user_agent == 'env_conf':
        if is_mobile:
            env_ua = os.getenv('WHOOGLE_USER_AGENT_MOBILE', '')
            if env_ua:
                return env_ua
        else:
            env_ua = os.getenv('WHOOGLE_USER_AGENT', '')
            if env_ua:
                return env_ua
        # If env vars are not set, fall back to Opera UA
        return DEFAULT_FALLBACK_UA

    # If using default user agent - use auto-generated Opera UA pool
    if config.user_agent == 'default':
        try:
            # Try to load UA pool from cache (lazy loading if not in
            # app.config). First check if we have access to Flask app
            # context
            try:
                from flask import current_app
                if (hasattr(current_app, 'config')
                        and 'UA_POOL' in current_app.config):
                    ua_pool = current_app.config['UA_POOL']
                else:
                    # Fall back to loading from disk
                    raise ImportError("UA_POOL not in app config")
            except (ImportError, RuntimeError):
                # No Flask context available or UA_POOL not in config,
                # load from disk
                config_path = os.environ.get(
                    'CONFIG_VOLUME',
                    os.path.join(
                        os.path.dirname(os.path.abspath(__file__)),
                        'static', 'config'))
                cache_path = os.path.join(config_path, 'ua_cache.json')
                ua_pool = load_ua_pool(cache_path, count=10)

            return get_random_ua(ua_pool)
        except Exception as e:
            # If anything goes wrong, fall back to default Opera UA
            print(f"Warning: Could not load UA pool, using fallback Opera UA: {e}")
            return DEFAULT_FALLBACK_UA

    # Fallback for backwards compatibility (old configs or invalid
    # user_agent values)
    return DEFAULT_FALLBACK_UA


def gen_query(query, args, config) -> str:
    """Builds the query string (search terms plus URL params) for a search.

    Args:
        query: The raw search query
        args: The request arguments (mapping of param name to string)
        config: The user's configuration

    Returns:
        str: The URL-quoted query followed by any '&key=value' params
    """
    param_dict = {key: '' for key in VALID_PARAMS}

    # Use :past(hour/day/week/month/year) if available
    # example search "new restaurants :past month"
    lang = ''
    time_range = ''
    if ':past' in query and 'tbs' not in args:
        time_range = query.split(':past', 1)[-1].strip()

    if time_range:
        param_dict['tbs'] = '&tbs=' + ('qdr:' + time_range[0].lower())
    elif 'tbs' in args or 'tbs' in config:
        # Also reached when ':past' is present without a time range after
        # it, in which case there is no range letter to extract.
        result_tbs = args.get('tbs') if 'tbs' in args else config['tbs']
        param_dict['tbs'] = '&tbs=' + result_tbs

        # Occasionally the 'tbs' param provided by google also contains a
        # field for 'lr', but formatted strangely. This is a rough solution
        # for this.
        #
        # Example:
        # &tbs=qdr:h,lr:lang_1pl
        # -- the lr param needs to be extracted and remove the leading '1'
        result_params = [_ for _ in result_tbs.split(',') if 'lr:' in _]
        if len(result_params) > 0:
            result_param = result_params[0]
            lang = result_param[result_param.find('lr:') + 3:]

    # Ensure search query is parsable
    query = urlparse.quote(query)

    # Pass along type of results (news, images, books, etc)
    if 'tbm' in args:
        param_dict['tbm'] = '&tbm=' + args.get('tbm')

    # Get results page start value (10 per page, ie page 2 start val = 20)
    if 'start' in args:
        param_dict['start'] = '&start=' + args.get('start')

    # Search for results near a particular city, if available
    if config.near:
        param_dict['near'] = '&near=' + urlparse.quote(config.near)

    # Set language for results (lr) if source isn't set, otherwise use the
    # result language param provided in the results
    if 'source' in args:
        param_dict['source'] = '&source=' + args.get('source')
        param_dict['lr'] = ('&lr=' + ''.join(
            [_ for _ in lang if not _.isdigit()]
        )) if lang else ''
    else:
        param_dict['lr'] = (
            '&lr=' + config.lang_search
        ) if config.lang_search else ''

    # 'nfpr' defines the exclusion of results from an auto-corrected query
    if 'nfpr' in args:
        param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')

    # 'chips' is used in image tabs to pass the optional 'filter' to add to
    # the given search term
    if 'chips' in args:
        param_dict['chips'] = '&chips=' + args.get('chips')

    param_dict['gl'] = (
        '&gl=' + config.country
    ) if config.country else ''
    param_dict['hl'] = (
        '&hl=' + config.lang_interface.replace('lang_', '')
    ) if config.lang_interface else ''
    param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

    # Block all sites specified in the user config
    unquoted_query = urlparse.unquote(query)
    for blocked_site in config.block.replace(' ', '').split(','):
        if not blocked_site:
            continue
        block = (' -site:' + blocked_site)
        query += block if block not in unquoted_query else ''

    for val in param_dict.values():
        if not val:
            continue
        query += val

    return query


class Request:
    """Class used for handling all outbound requests, including search
    queries, search suggestions, and loading of external content (images,
    audio, etc).

    Attributes:
        normal_ua: the user's current user agent
        root_path: the root path of the whoogle instance
        config: the user's current whoogle configuration
    """

    def __init__(self, normal_ua, root_path, config: Config,
                 http_client=None):
        self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
            os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='

        # Optionally send heartbeat to Tor to determine availability
        # Only when Tor is enabled in config to avoid unnecessary socket
        # usage
        if config.tor:
            send_tor_signal(Signal.HEARTBEAT)

        self.language = config.lang_search if config.lang_search else ''
        self.country = config.country if config.country else ''

        # For setting Accept-language Header
        self.lang_interface = ''
        if config.accept_language:
            self.lang_interface = config.lang_interface

        self.mobile = bool(normal_ua) and ('Android' in normal_ua
                                           or 'iPhone' in normal_ua)

        # Generate user agent based on config
        self.modified_user_agent = gen_user_agent(config, self.mobile)
        if not self.mobile:
            self.modified_user_agent_mobile = gen_user_agent(config, True)

        # Set up proxy configuration
        proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
        if proxy_path:
            proxy_type = os.environ.get('WHOOGLE_PROXY_TYPE', '')
            proxy_user = os.environ.get('WHOOGLE_PROXY_USER', '')
            proxy_pass = os.environ.get('WHOOGLE_PROXY_PASS', '')
            auth_str = ''
            if proxy_user:
                auth_str = f'{proxy_user}:{proxy_pass}@'

            proxy_str = f'{proxy_type}://{auth_str}{proxy_path}'
            self.proxies = {
                'https': proxy_str,
                'http': proxy_str
            }
        else:
            self.proxies = {
                'http': 'socks5://127.0.0.1:9050',
                'https': 'socks5://127.0.0.1:9050'
            } if config.tor else {}

        self.tor = config.tor
        self.tor_valid = False
        self.root_path = root_path

        # Initialize HTTP client (shared per proxies)
        self.http_client = http_client or get_http_client(self.proxies)

    def __getitem__(self, name):
        return getattr(self, name)

    def autocomplete(self, query) -> list:
        """Sends a query to Google's search suggestion service

        Args:
            query: The in-progress query to send

        Returns:
            list: The list of matches for possible search suggestions
        """
        # Check if autocomplete is disabled via environment variable
        if os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') == '0':
            return []

        try:
            ac_query = dict(q=query)
            if self.language:
                ac_query['lr'] = self.language
            if self.country:
                ac_query['gl'] = self.country
            if self.lang_interface:
                ac_query['hl'] = self.lang_interface

            response = self.send(base_url=AUTOCOMPLETE_URL,
                                 query=urlparse.urlencode(ac_query)).text

            if not response:
                return []

            try:
                root = ET.fromstring(response)
                return [_.attrib['data'] for _ in
                        root.findall('.//suggestion/[@data]')]
            except ET.ParseError:
                # Malformed XML response
                return []
        except Exception as e:
            # Log the error but don't crash - autocomplete is non-essential
            print(f"Autocomplete error: {str(e)}")
            return []

    def send(self, base_url='', query='', attempt=0,
             force_mobile=False, user_agent=''):
        """Sends an outbound request to a URL. Optionally sends the request
        using Tor, if enabled by the user.

        Args:
            base_url: The URL to use in the request
            query: The optional query string for the request
            attempt: The number of attempts made for the request
                (used for cycling through Tor identities, if enabled)
            force_mobile: Optional flag to enable a mobile user agent
                (used for fetching full size images in search results)
            user_agent: The client's own user agent, only used when
                WHOOGLE_USE_CLIENT_USER_AGENT is set to 1

        Returns:
            Response: The response object returned by the HTTP client

        Raises:
            TorError: If Tor is enabled but cannot be reached or
                validated, or if more than 10 attempts were blocked by a
                captcha
        """
        try:
            use_client_user_agent = int(
                os.environ.get('WHOOGLE_USE_CLIENT_USER_AGENT', '0'))
        except ValueError:
            # A non-numeric value is treated as "disabled" instead of
            # failing every outbound request.
            use_client_user_agent = 0

        if user_agent and use_client_user_agent == 1:
            modified_user_agent = user_agent
        else:
            if force_mobile and not self.mobile:
                modified_user_agent = self.modified_user_agent_mobile
            else:
                modified_user_agent = self.modified_user_agent

        headers = {
            'User-Agent': modified_user_agent,
            'Accept': ('text/html,application/xhtml+xml,application/xml;'
                       'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Sec-CH-UA': (
                '"Not/A)Brand";v="8", '
                '"Chromium";v="127", '
                '"Google Chrome";v="127"'
            ),
            'Sec-CH-UA-Mobile': '?0',
            'Sec-CH-UA-Platform': '"macOS"'
        }

        # Add Accept-Language header tied to the current config if
        # requested
        if self.lang_interface:
            headers['Accept-Language'] = (
                self.lang_interface.replace('lang_', '') + ';q=1.0'
            )

        # Consent cookies keep Google from showing the interstitial
        # consent wall
        consent_cookies = {
            'CONSENT': 'PENDING+987',
            'SOCS': 'CAESHAgBEhIaAB'
        }

        # Validate Tor conn and request new identity if the last one failed
        if self.tor and not send_tor_signal(
                Signal.NEWNYM if attempt > 0 else Signal.HEARTBEAT):
            raise TorError(
                "Tor was previously enabled, but the connection has been "
                "dropped. Please check your Tor configuration and try again.",
                disable=True)

        # Make sure that the tor connection is valid, if enabled
        if self.tor:
            try:
                tor_check = self.http_client.get(
                    'https://check.torproject.org/',
                    headers=headers, retries=1)
                self.tor_valid = 'Congratulations' in tor_check.text

                if not self.tor_valid:
                    raise TorError(
                        "Tor connection succeeded, but the connection could "
                        "not be validated by torproject.org",
                        disable=True)
            except httpx.RequestError as err:
                raise TorError(
                    "Error raised during Tor connection validation",
                    disable=True) from err

        # Any httpx error raised here propagates to the caller unchanged
        response = self.http_client.get(
            (base_url or self.search_url) + query,
            headers=headers,
            cookies=consent_cookies)

        # Retry query with new identity if using Tor (max 10 attempts)
        if 'form id="captcha-form"' in response.text and self.tor:
            attempt += 1
            if attempt > 10:
                raise TorError("Tor query failed -- max attempts exceeded 10")
            # Forward the user agent options so that a retry uses the same
            # user agent as the initial attempt.
            return self.send(base_url or self.search_url, query, attempt,
                             force_mobile=force_mobile,
                             user_agent=user_agent)

        return response