From ffdeeb5f44fa7ec642858e12ae0be70620adfc9a Mon Sep 17 00:00:00 2001
From: Don-Swanson <32144818+Don-Swanson@users.noreply.github.com>
Date: Tue, 23 Sep 2025 21:37:21 -0500
Subject: [PATCH] Enhance autocomplete functionality by adding an environment
 variable check to enable/disable it globally. Improve error handling in the
 HTTP client for closed connections and add client recreation logic. Refactor
 link extraction to avoid details elements in search results.

---
 app/request.py              | 45 ++++++++++++++++++++--------------
 app/routes.py               |  4 ++-
 app/services/http_client.py | 49 ++++++++++++++++++++++++++++++++++++-
 app/utils/results.py        | 18 +++++---------
 app/utils/search.py         |  5 ++++
 5 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/app/request.py b/app/request.py
index 2c11c03..5f40bbc 100644
--- a/app/request.py
+++ b/app/request.py
@@ -265,26 +265,35 @@ class Request:
             list: The list of matches for possible search suggestions
 
         """
-        ac_query = dict(q=query)
-        if self.language:
-            ac_query['lr'] = self.language
-        if self.country:
-            ac_query['gl'] = self.country
-        if self.lang_interface:
-            ac_query['hl'] = self.lang_interface
-
-        response = self.send(base_url=AUTOCOMPLETE_URL,
-                             query=urlparse.urlencode(ac_query)).text
-
-        if not response:
+        # Check if autocomplete is disabled via environment variable
+        if os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') == '0':
             return []
-
+
         try:
-            root = ET.fromstring(response)
-            return [_.attrib['data'] for _ in
-                    root.findall('.//suggestion/[@data]')]
-        except ET.ParseError:
-            # Malformed XML response
+            ac_query = dict(q=query)
+            if self.language:
+                ac_query['lr'] = self.language
+            if self.country:
+                ac_query['gl'] = self.country
+            if self.lang_interface:
+                ac_query['hl'] = self.lang_interface
+
+            response = self.send(base_url=AUTOCOMPLETE_URL,
+                                 query=urlparse.urlencode(ac_query)).text
+
+            if not response:
+                return []
+
+            try:
+                root = ET.fromstring(response)
+                return [_.attrib['data'] for _ in
+                        root.findall('.//suggestion/[@data]')]
+            except ET.ParseError:
+                # Malformed XML response
+                return []
+        except Exception as e:
+            # Log the error but don't crash - autocomplete is non-essential
+            print(f"Autocomplete error: {str(e)}")
             return []
 
     def send(self, base_url='', query='', attempt=0,
diff --git a/app/routes.py b/app/routes.py
index 3e6c53e..258abb9 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -283,9 +283,11 @@ def autocomplete():
     #
     # Note: If Tor is enabled, this returns nothing, as the request is
     # almost always rejected
+    # Also check if autocomplete is disabled globally
+    autocomplete_enabled = os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') != '0'
     return jsonify([
         q,
-        g.user_request.autocomplete(q) if not g.user_config.tor else []
+        g.user_request.autocomplete(q) if (not g.user_config.tor and autocomplete_enabled) else []
     ])
 
 @app.route(f'/{Endpoint.search}', methods=['GET', 'POST'])
diff --git a/app/services/http_client.py b/app/services/http_client.py
index 9ca5d03..4f9730b 100644
--- a/app/services/http_client.py
+++ b/app/services/http_client.py
@@ -84,13 +84,22 @@ class HttpxClient:
         attempt = 0
         while attempt <= retries:
             try:
+                # Check if client is closed and recreate if needed
+                if self._client.is_closed:
+                    self._recreate_client()
+
                 response = self._client.get(url, headers=headers, cookies=cookies)
                 if use_cache and response.status_code == 200:
                     with self._cache_lock:
                         self._cache[key] = response
                 return response
-            except httpx.HTTPError as exc:
+            except (httpx.HTTPError, RuntimeError) as exc:
                 last_exc = exc
+                if "client has been closed" in str(exc).lower():
+                    # Recreate client and try again
+                    self._recreate_client()
+                    if attempt < retries:
+                        continue
                 if attempt == retries:
                     raise
                 time.sleep(backoff_seconds * (2 ** attempt))
@@ -101,6 +110,44 @@ class HttpxClient:
             raise last_exc
         raise httpx.HTTPError('Unknown HTTP error')
 
+    def _recreate_client(self) -> None:
+        """Recreate the HTTP client when it has been closed."""
+        try:
+            self._client.close()
+        except Exception:
+            pass  # Client might already be closed
+
+        # Recreate with same configuration
+        client_kwargs = dict(timeout=self._timeout_seconds,
+                             follow_redirects=True)
+
+        if self._proxies:
+            proxy_values = list(self._proxies.values())
+            single_proxy = proxy_values[0] if proxy_values and all(v == proxy_values[0] for v in proxy_values) else None
+            if single_proxy:
+                try:
+                    self._client = httpx.Client(proxy=single_proxy, **client_kwargs)
+                except TypeError:
+                    try:
+                        self._client = httpx.Client(proxies=self._proxies, **client_kwargs)
+                    except TypeError:
+                        mounts: Dict[str, httpx.Proxy] = {}
+                        for scheme_key, url in self._proxies.items():
+                            prefix = f"{scheme_key}://"
+                            mounts[prefix] = httpx.Proxy(url)
+                        self._client = httpx.Client(mounts=mounts, **client_kwargs)
+            else:
+                try:
+                    self._client = httpx.Client(proxies=self._proxies, **client_kwargs)
+                except TypeError:
+                    mounts: Dict[str, httpx.Proxy] = {}
+                    for scheme_key, url in self._proxies.items():
+                        prefix = f"{scheme_key}://"
+                        mounts[prefix] = httpx.Proxy(url)
+                    self._client = httpx.Client(mounts=mounts, **client_kwargs)
+        else:
+            self._client = httpx.Client(**client_kwargs)
+
     def close(self) -> None:
         self._client.close()
diff --git a/app/utils/results.py b/app/utils/results.py
index d7b3991..bc7c910 100644
--- a/app/utils/results.py
+++ b/app/utils/results.py
@@ -136,7 +136,7 @@ def has_ad_content(element: str) -> bool:
             or 'ⓘ' in element)
 
 
-def get_first_link(soup: BeautifulSoup) -> str:
+def get_first_link(soup) -> str:
     """Retrieves the first result link from the query response
 
     Args:
@@ -147,24 +147,18 @@
     """
     first_link = ''
-    orig_details = []
 
-    # Temporarily remove details so we don't grab those links
-    for details in soup.find_all('details'):
-        temp_details = soup.new_tag('removed_details')
-        orig_details.append(details.replace_with(temp_details))
-
-    # Replace hrefs with only the intended destination (no "utm" type tags)
+    # Find the first valid search result link, excluding details elements
     for a in soup.find_all('a', href=True):
+        # Skip links that are inside details elements (collapsible sections)
+        if a.find_parent('details'):
+            continue
+
         # Return the first search result URL
         if a['href'].startswith('http://') or a['href'].startswith('https://'):
            first_link = a['href']
            break
 
-    # Add the details back
-    for orig_detail, details in zip(orig_details, soup.find_all('removed_details')):
-        details.replace_with(orig_detail)
-
     return first_link
diff --git a/app/utils/search.py b/app/utils/search.py
index bee3db7..19d60ca 100644
--- a/app/utils/search.py
+++ b/app/utils/search.py
@@ -161,6 +161,11 @@ class Search:
         # Produce cleanable html soup from response
         get_body_safed = get_body.text.replace("&lt;","andlt;").replace("&gt;","andgt;")
         html_soup = bsoup(get_body_safed, 'html.parser')
+
+        # Ensure we extract only the content within <html> if it exists
+        # This prevents doctype declarations from appearing in the output
+        if html_soup.html:
+            html_soup = html_soup.html
 
         # Replace current soup if view_image is active
         # FIXME: Broken since the user agent changes as of 16 Jan 2025
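
Illustration (not part of the patch): the toggle added above treats any value of WHOOGLE_AUTOCOMPLETE other than '0', including an unset variable, as enabled. Below is a minimal, self-contained Python sketch of that behaviour; the helper name autocomplete_enabled() is hypothetical and does not exist in the codebase, it only mirrors the guard added in app/request.py and app/routes.py.

import os

def autocomplete_enabled() -> bool:
    # Hypothetical helper mirroring the guard added in app/request.py and
    # app/routes.py: only an explicit '0' disables autocomplete; an unset
    # variable falls back to the default of '1' (enabled).
    return os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') != '0'

if __name__ == '__main__':
    # Exercise the three interesting cases: unset, explicitly on, explicitly off.
    for value in (None, '1', '0'):
        if value is None:
            os.environ.pop('WHOOGLE_AUTOCOMPLETE', None)
        else:
            os.environ['WHOOGLE_AUTOCOMPLETE'] = value
        print(f"WHOOGLE_AUTOCOMPLETE={value!r}: enabled={autocomplete_enabled()}")

Defaulting to '1' keeps existing deployments unchanged; only an explicit WHOOGLE_AUTOCOMPLETE=0 turns the suggestion endpoint off.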