# whoogle-search/app/services/http_client.py (Python; originally 220 lines, 8.6 KiB)

import threading
import time
from typing import Any, Dict, Optional, Tuple
import httpx
from cachetools import TTLCache
import ssl
import os
# h2 is an optional dependency. When it is installed, its ProtocolError is
# imported so HTTP/2 protocol failures can be detected with isinstance();
# otherwise the name is bound to None and callers must check for that.
try:
    from h2.exceptions import ProtocolError as H2ProtocolError
except ImportError:
    H2ProtocolError = None
class HttpxClient:
    """Thin wrapper around httpx.Client providing simple retries and optional TTL caching.

    The client is intended to be safe for reuse across requests. Per-request
    overrides for headers/cookies are supported.

    Environment variables read by this class:
        - WHOOGLE_DISABLE_HTTP2: truthy value forces HTTP/1.1.
        - WHOOGLE_CA_BUNDLE: path to a CA bundle used for verification.
        - WHOOGLE_SSL_VERIFY: '0'/'false'/'no'/'n' disables verification.
        - WHOOGLE_SSL_BACKEND: 'system' uses ``ssl.create_default_context()``.
        - WHOOGLE_INSECURE_FALLBACK: truthy value allows an unverified client
          as a last resort when building the client raises ``ssl.SSLError``.
    """

    # Environment-variable values (lower-cased) that count as "enabled".
    _TRUTHY_ENV_VALUES = ('1', 'true', 't', 'yes', 'y')

    def __init__(
            self,
            proxies: Optional[Dict[str, str]] = None,
            timeout_seconds: float = 15.0,
            cache_ttl_seconds: int = 30,
            cache_maxsize: int = 256,
            http2: bool = True) -> None:
        """Create the underlying httpx client and the response cache.

        Args:
            proxies: Mapping of scheme key to proxy URL; empty/None means no proxy.
            timeout_seconds: Timeout passed to ``httpx.Client``.
            cache_ttl_seconds: Time-to-live of cached responses.
            cache_maxsize: Maximum number of cached responses.
            http2: Whether to enable HTTP/2 (overridden by WHOOGLE_DISABLE_HTTP2).

        Raises:
            ssl.SSLError: If the client cannot be built with any permitted
                verification setting.
        """
        # Allow disabling HTTP/2 via environment variable:
        # HTTP/2 can sometimes cause protocol errors with certain servers.
        if self._env_flag('WHOOGLE_DISABLE_HTTP2'):
            http2 = False

        self._proxies = proxies or {}
        self._http2 = http2
        self._timeout_seconds = timeout_seconds

        # Determine verify behavior and initialize client with fallbacks.
        self._verify = self._determine_verify_setting()
        self._client = self._create_client_with_fallback()

        self._cache = TTLCache(maxsize=cache_maxsize, ttl=cache_ttl_seconds)
        self._cache_lock = threading.Lock()

    @staticmethod
    def _env_flag(name: str, default: str = '') -> bool:
        """Return True when environment variable *name* holds a truthy value."""
        value = os.environ.get(name, default).strip().lower()
        return value in HttpxClient._TRUTHY_ENV_VALUES

    def _determine_verify_setting(self):
        """Determine SSL verification setting from environment.

        Honors, in this order:
            - WHOOGLE_CA_BUNDLE: path to CA bundle file (returned as-is)
            - WHOOGLE_SSL_VERIFY: '0'/'false'/'no'/'n' to disable verification
            - WHOOGLE_SSL_BACKEND: 'system' to prefer system trust store

        Returns:
            A CA bundle path, ``False``, an ``ssl.SSLContext``, or ``True``.
        """
        ca_bundle = os.environ.get('WHOOGLE_CA_BUNDLE', '').strip()
        if ca_bundle:
            return ca_bundle

        verify_env = os.environ.get('WHOOGLE_SSL_VERIFY', '1').lower()
        if verify_env in ('0', 'false', 'no', 'n'):
            return False

        backend = os.environ.get('WHOOGLE_SSL_BACKEND', '').lower()
        if backend == 'system':
            return ssl.create_default_context()

        return True

    def _client_kwargs(self) -> Dict[str, Any]:
        """Return the httpx.Client keyword arguments shared by every (re)build."""
        return dict(http2=self._http2,
                    timeout=self._timeout_seconds,
                    follow_redirects=True)

    def _create_client_with_fallback(self) -> "httpx.Client":
        """Build a client, relaxing verification step by step on ssl.SSLError.

        Order: the configured ``self._verify`` setting, then a fresh system
        default SSL context, then (only if WHOOGLE_INSECURE_FALLBACK is
        truthy) no verification at all. ``self._verify`` is updated to the
        setting that succeeded so later rebuilds start from it.

        Raises:
            ssl.SSLError: If every permitted attempt fails.
        """
        client_kwargs = self._client_kwargs()
        try:
            return self._build_client(client_kwargs, self._verify)
        except ssl.SSLError:
            pass  # fall through to the system trust store

        try:
            system_ctx = ssl.create_default_context()
            client = self._build_client(client_kwargs, system_ctx)
        except ssl.SSLError:
            if not self._env_flag('WHOOGLE_INSECURE_FALLBACK', '0'):
                raise
            client = self._build_client(client_kwargs, False)
            self._verify = False
            return client

        self._verify = system_ctx
        return client

    def _proxy_mounts(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Build a ``mounts`` mapping with one proxying transport per scheme key.

        ``mounts`` values must be transports, so each proxy URL is wrapped in
        ``httpx.HTTPTransport(proxy=...)``. Mounted transports do not inherit
        the client's ``verify``/``http2`` settings, so those are passed along.
        """
        transport_kwargs = {
            name: kwargs[name] for name in ('verify', 'http2') if name in kwargs
        }
        mounts: Dict[str, Any] = {}
        for scheme_key, url in self._proxies.items():
            # Accept both 'https' and 'https://' style keys.
            prefix = scheme_key if '://' in scheme_key else f"{scheme_key}://"
            mounts[prefix] = httpx.HTTPTransport(proxy=url, **transport_kwargs)
        return mounts

    def _build_client(self, client_kwargs: Dict[str, Any], verify: Any) -> "httpx.Client":
        """Construct httpx.Client with proxies and provided verify setting.

        Proxy configuration is attempted in this order, moving on when the
        installed httpx rejects the keyword with ``TypeError``:
            1. ``proxy=`` when every configured proxy URL is identical,
            2. ``proxies=`` with the full mapping,
            3. ``mounts=`` with explicit proxy transports.
        """
        kwargs = dict(client_kwargs)
        kwargs['verify'] = verify

        if not self._proxies:
            return httpx.Client(**kwargs)

        proxy_values = list(self._proxies.values())
        first = proxy_values[0]
        if first and all(value == first for value in proxy_values):
            try:
                return httpx.Client(proxy=first, **kwargs)
            except TypeError:
                pass  # this httpx does not accept proxy=

        try:
            return httpx.Client(proxies=self._proxies, **kwargs)
        except TypeError:
            pass  # this httpx does not accept proxies=

        return httpx.Client(mounts=self._proxy_mounts(kwargs), **kwargs)

    @property
    def proxies(self) -> Dict[str, str]:
        """The proxy mapping this client was configured with (possibly empty)."""
        return self._proxies

    def _cache_key(self,
                   method: str,
                   url: str,
                   headers: Optional[Dict[str, str]],
                   cookies: Optional[Dict[str, str]] = None) -> Tuple[Any, ...]:
        """Build a hashable cache key for a request.

        The key is ``(METHOD, url, sorted header items)``. When cookies are
        supplied, their sorted items are appended so that responses obtained
        with different cookies are never served to one another.
        """
        normalized_headers = tuple(sorted((headers or {}).items()))
        key: Tuple[Any, ...] = (method.upper(), url, normalized_headers)
        if cookies:
            key += (tuple(sorted(cookies.items())),)
        return key

    @staticmethod
    def _should_recreate(exc: Exception) -> bool:
        """Return True when *exc* indicates the client must be rebuilt."""
        message = str(exc)
        if (isinstance(exc, (httpx.HTTPError, RuntimeError))
                and "client has been closed" in message.lower()):
            return True
        # Handle H2 protocol errors (connection state issues).
        if H2ProtocolError is not None and isinstance(exc, H2ProtocolError):
            return True
        # Also check if the error message contains h2 protocol error info.
        return "ProtocolError" in message or "ConnectionState.CLOSED" in message

    def get(self,
            url: str,
            headers: Optional[Dict[str, str]] = None,
            cookies: Optional[Dict[str, str]] = None,
            retries: int = 2,
            backoff_seconds: float = 0.5,
            use_cache: bool = False) -> "httpx.Response":
        """Issue a GET request with retries and optional caching.

        Args:
            url: Target URL.
            headers: Per-request headers.
            cookies: Per-request cookies.
            retries: Number of retries after the first attempt.
            backoff_seconds: Base delay; attempt ``i`` sleeps
                ``backoff_seconds * 2**i`` before the next try.
            use_cache: When True, return a cached response if one exists and
                cache new responses whose status code is 200.

        Returns:
            The response (any status code; only exceptions trigger retries).

        Raises:
            Exception: The error from the final failed attempt.
            httpx.HTTPError: If no attempt was made (negative ``retries``).
        """
        key = self._cache_key('GET', url, headers, cookies) if use_cache else None
        if use_cache:
            with self._cache_lock:
                cached = self._cache.get(key)
            if cached is not None:
                return cached

        last_exc: Optional[Exception] = None
        for attempt in range(retries + 1):
            try:
                # Check if client is closed and recreate if needed.
                if self._client.is_closed:
                    self._recreate_client()
                response = self._client.get(url, headers=headers, cookies=cookies)
            except Exception as exc:
                last_exc = exc
                if self._should_recreate(exc):
                    self._recreate_client()
                if attempt >= retries:
                    raise
                time.sleep(backoff_seconds * (2 ** attempt))
                continue

            if use_cache and response.status_code == 200:
                with self._cache_lock:
                    self._cache[key] = response
            return response

        # Only reachable when the loop body never ran.
        if last_exc:
            raise last_exc
        raise httpx.HTTPError('Unknown HTTP error')

    def _recreate_client(self) -> None:
        """Close the current client (best effort) and build a replacement.

        The replacement uses the same timeout, HTTP/2 and proxy configuration
        and the same SSL fallback sequence as initial construction.
        """
        try:
            self._client.close()
        except Exception:
            pass  # Client might already be closed; closing is best-effort.
        self._client = self._create_client_with_fallback()

    def close(self) -> None:
        """Close the underlying httpx client."""
        self._client.close()