mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-01-30 15:43:38 -05:00
220 lines
8.6 KiB
Python
220 lines
8.6 KiB
Python
import threading
|
|
import time
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
import httpx
|
|
from cachetools import TTLCache
|
|
import ssl
|
|
import os
|
|
|
|
# Import h2 exceptions for better error handling.
# The import is optional: when h2 is not installed, H2ProtocolError is bound
# to None, so code using it must check for None before calling isinstance().
try:
    from h2.exceptions import ProtocolError as H2ProtocolError
except ImportError:
    H2ProtocolError = None
|
|
|
|
|
|
class HttpxClient:
    """Thin wrapper around httpx.Client providing simple retries and optional TTL caching.

    The client is intended to be safe for reuse across requests. Per-request
    overrides for headers/cookies are supported.

    Environment variables consulted:
      - WHOOGLE_DISABLE_HTTP2: truthy value forces ``http2=False``.
      - WHOOGLE_CA_BUNDLE / WHOOGLE_SSL_VERIFY / WHOOGLE_SSL_BACKEND: see
        ``_determine_verify_setting``.
      - WHOOGLE_INSECURE_FALLBACK: truthy value allows building a client with
        certificate verification disabled when both the configured and the
        system trust store raise ``ssl.SSLError``.
    """

    # Lower-cased values that enable a boolean environment flag.
    _TRUTHY = ('1', 'true', 't', 'yes', 'y')

    def __init__(
            self,
            proxies: Optional[Dict[str, str]] = None,
            timeout_seconds: float = 15.0,
            cache_ttl_seconds: int = 30,
            cache_maxsize: int = 256,
            http2: bool = True) -> None:
        """Create the underlying httpx client and the response cache.

        Args:
            proxies: Mapping of scheme (e.g. ``'http'``) to proxy URL, or None.
            timeout_seconds: Timeout handed to httpx.Client.
            cache_ttl_seconds: Lifetime of cached responses.
            cache_maxsize: Maximum number of cached responses.
            http2: Whether to enable HTTP/2 (overridden by
                WHOOGLE_DISABLE_HTTP2).

        Raises:
            ssl.SSLError: If no client could be built and the insecure
                fallback is not enabled.
        """
        # Allow disabling HTTP/2 via environment variable
        # HTTP/2 can sometimes cause protocol errors with certain servers
        if self._env_flag('WHOOGLE_DISABLE_HTTP2'):
            http2 = False

        self._proxies = proxies or {}
        self._http2 = http2
        self._timeout_seconds = timeout_seconds

        # Determine verify behavior and initialize client with fallbacks
        self._verify = self._determine_verify_setting()
        self._client = self._build_client_with_fallback()

        self._cache = TTLCache(maxsize=cache_maxsize, ttl=cache_ttl_seconds)
        self._cache_lock = threading.Lock()

    @classmethod
    def _env_flag(cls, name: str) -> bool:
        """Return True when environment variable *name* holds a truthy value."""
        return os.environ.get(name, '').lower() in cls._TRUTHY

    @staticmethod
    def _mount_pattern(scheme_key: str) -> str:
        """Turn a proxy key such as ``'http'`` into the ``'http://'`` URL-pattern form.

        Keys that already contain ``'://'`` are returned unchanged.
        """
        return scheme_key if '://' in scheme_key else f"{scheme_key}://"

    def _client_kwargs(self) -> Dict[str, Any]:
        """Keyword arguments shared by every client this wrapper builds."""
        return dict(http2=self._http2,
                    timeout=self._timeout_seconds,
                    follow_redirects=True)

    def _build_client_with_fallback(self) -> "httpx.Client":
        """Build a client, degrading the verify setting on ``ssl.SSLError``.

        Order of attempts: the current ``self._verify`` value, then a default
        system SSL context, then (only if WHOOGLE_INSECURE_FALLBACK is truthy)
        no verification. ``self._verify`` is updated to whichever fallback
        setting succeeded so later rebuilds start from it.

        Raises:
            ssl.SSLError: If the system trust store attempt fails and the
                insecure fallback is not enabled.
        """
        client_kwargs = self._client_kwargs()
        try:
            return self._build_client(client_kwargs, self._verify)
        except ssl.SSLError:
            # Fallback to system trust store
            try:
                system_ctx = ssl.create_default_context()
                client = self._build_client(client_kwargs, system_ctx)
                self._verify = system_ctx
                return client
            except ssl.SSLError:
                if not self._env_flag('WHOOGLE_INSECURE_FALLBACK'):
                    raise
                client = self._build_client(client_kwargs, False)
                self._verify = False
                return client

    def _determine_verify_setting(self) -> Any:
        """Determine SSL verification setting from environment.

        Honors:
        - WHOOGLE_CA_BUNDLE: path to CA bundle file
        - WHOOGLE_SSL_VERIFY: '0' to disable verification
        - WHOOGLE_SSL_BACKEND: 'system' to prefer system trust store

        Returns:
            The CA bundle path (str), ``False``, an ``ssl.SSLContext`` or
            ``True``, checked in that order of precedence.
        """
        ca_bundle = os.environ.get('WHOOGLE_CA_BUNDLE', '').strip()
        if ca_bundle:
            return ca_bundle

        verify_env = os.environ.get('WHOOGLE_SSL_VERIFY', '1').lower()
        if verify_env in ('0', 'false', 'no', 'n'):
            return False

        backend = os.environ.get('WHOOGLE_SSL_BACKEND', '').lower()
        if backend == 'system':
            return ssl.create_default_context()

        return True

    def _normalized_proxies(self) -> Dict[str, str]:
        """Return the proxy mapping with keys in ``scheme://`` pattern form."""
        return {self._mount_pattern(key): url
                for key, url in self._proxies.items()}

    def _proxy_mounts(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Build a ``mounts=`` mapping of URL pattern to proxying transport.

        httpx expects transport instances as mount values, and a mounted
        transport does not inherit the client's ``verify``/``http2`` options,
        so both are passed to each transport explicitly.
        """
        transport_kwargs = {name: kwargs[name]
                            for name in ('verify', 'http2') if name in kwargs}
        return {
            pattern: httpx.HTTPTransport(proxy=httpx.Proxy(url),
                                         **transport_kwargs)
            for pattern, url in self._normalized_proxies().items()
        }

    def _build_client(self, client_kwargs: Dict[str, Any], verify: Any) -> "httpx.Client":
        """Construct httpx.Client with proxies and provided verify setting.

        The proxy argument style differs between httpx releases, so the
        styles are tried in turn and a ``TypeError`` (unknown keyword) moves
        on to the next one:

        1. ``proxy=`` when every configured scheme uses the same proxy URL.
        2. ``proxies=`` with the scheme-to-URL mapping.
        3. ``mounts=`` with one proxying transport per scheme.
        """
        kwargs = dict(client_kwargs)
        kwargs['verify'] = verify

        if not self._proxies:
            return httpx.Client(**kwargs)

        proxy_values = list(self._proxies.values())
        first = proxy_values[0]
        if first and all(value == first for value in proxy_values):
            try:
                return httpx.Client(proxy=first, **kwargs)
            except TypeError:
                pass  # this httpx does not know proxy=; try the mapping forms

        try:
            # Plain scheme keys are rejected by httpx URL patterns, so the
            # mapping is passed with "scheme://" keys.
            return httpx.Client(proxies=self._normalized_proxies(), **kwargs)
        except TypeError:
            return httpx.Client(mounts=self._proxy_mounts(kwargs), **kwargs)

    @property
    def proxies(self) -> Dict[str, str]:
        """The proxy mapping this client was configured with (may be empty)."""
        return self._proxies

    def _cache_key(self,
                   method: str,
                   url: str,
                   headers: Optional[Dict[str, str]],
                   cookies: Optional[Dict[str, str]] = None) -> Tuple[Any, ...]:
        """Build a hashable cache key for a request.

        The key is ``(METHOD, url, sorted_header_items)``. When cookies are
        supplied, their sorted items are appended as a fourth element so
        responses fetched with different cookies are never shared; without
        cookies the key is the plain three-element form.
        """
        normalized_headers = tuple(sorted((headers or {}).items()))
        key: Tuple[Any, ...] = (method.upper(), url, normalized_headers)
        if cookies:
            key += (tuple(sorted(cookies.items())),)
        return key

    @staticmethod
    def _needs_new_client(exc: Exception) -> bool:
        """Return True when *exc* indicates the client must be rebuilt.

        That is the case for a closed client, an h2 ``ProtocolError`` (when
        h2 is importable), or an error message mentioning a protocol error
        or a closed HTTP/2 connection state.
        """
        message = str(exc)
        if (isinstance(exc, (httpx.HTTPError, RuntimeError))
                and "client has been closed" in message.lower()):
            return True

        # Handle H2 protocol errors (connection state issues)
        if H2ProtocolError is not None and isinstance(exc, H2ProtocolError):
            return True

        # Also check if the error message contains h2 protocol error info
        return "ProtocolError" in message or "ConnectionState.CLOSED" in message

    def get(self,
            url: str,
            headers: Optional[Dict[str, str]] = None,
            cookies: Optional[Dict[str, str]] = None,
            retries: int = 2,
            backoff_seconds: float = 0.5,
            use_cache: bool = False) -> "httpx.Response":
        """Perform a GET request with retries and optional caching.

        Args:
            url: Target URL.
            headers: Per-request headers.
            cookies: Per-request cookies.
            retries: Number of additional attempts after the first failure.
            backoff_seconds: Base delay; attempt ``n`` (0-based) sleeps
                ``backoff_seconds * 2 ** n`` before the next try.
            use_cache: When True, return a cached response for the same
                method/url/headers/cookies if present, and cache responses
                with status 200.

        Returns:
            The httpx response (possibly served from the cache).

        Raises:
            Exception: The error from the final attempt is re-raised.
            httpx.HTTPError: If no attempt was made (negative ``retries``).
        """
        key = None
        if use_cache:
            key = self._cache_key('GET', url, headers, cookies)
            with self._cache_lock:
                cached = self._cache.get(key)
            if cached is not None:
                return cached

        last_exc: Optional[Exception] = None
        attempt = 0
        while attempt <= retries:
            try:
                # Check if client is closed and recreate if needed
                if self._client.is_closed:
                    self._recreate_client()

                response = self._client.get(url, headers=headers, cookies=cookies)
                if use_cache and response.status_code == 200:
                    with self._cache_lock:
                        self._cache[key] = response
                return response
            except Exception as exc:
                last_exc = exc
                # Some errors leave the client unusable; rebuild it before
                # retrying (or before surfacing the error on the last try).
                if self._needs_new_client(exc):
                    self._recreate_client()

                # Last attempt: propagate the original error
                if attempt >= retries:
                    raise

                time.sleep(backoff_seconds * (2 ** attempt))
                attempt += 1

        # Only reachable when the loop body never ran (retries < 0)
        if last_exc:
            raise last_exc
        raise httpx.HTTPError('Unknown HTTP error')

    def _recreate_client(self) -> None:
        """Recreate the HTTP client when it has been closed."""
        try:
            self._client.close()
        except Exception:
            pass  # Best effort: client might already be closed

        # Recreate with same configuration
        self._client = self._build_client_with_fallback()

    def close(self) -> None:
        """Close the underlying httpx client."""
        self._client.close()
|
|
|
|
|