whoogle-search/app/__init__.py
Don-Swanson 9b3a6ce550
Update README and codebase to enhance User Agent handling
- Revised README to reflect changes in Google search behavior and Whoogle's response strategies.
- Implemented a User Agent pool for improved request handling, including fallback mechanisms.
- Added configuration options for displaying the User Agent in search results.
- Introduced a command-line tool for generating custom User Agent strings.
- Enhanced request headers to include additional parameters for better compatibility with Google services.
2025-11-23 20:35:08 -06:00

268 lines
10 KiB
Python

from app.filter import clean_query
from app.request import send_tor_signal
from app.utils.session import generate_key
from app.utils.bangs import gen_bangs_json, load_all_bangs
from app.utils.misc import gen_file_hash, read_config_bool
from app.utils.ua_generator import load_ua_pool
from base64 import b64encode
from bs4 import MarkupResemblesLocatorWarning
from datetime import datetime, timedelta
from dotenv import load_dotenv
from flask import Flask
import json
import logging.config
import os
from stem import Signal
import threading
import warnings
from werkzeug.middleware.proxy_fix import ProxyFix
from app.utils.misc import read_config_bool
from app.services.http_client import HttpxClient
from app.services.provider import close_all_clients
from app.version import __version__
app = Flask(__name__, static_folder=os.path.dirname(
os.path.abspath(__file__)) + '/static')
app.wsgi_app = ProxyFix(app.wsgi_app)
# look for WHOOGLE_ENV, else look in parent directory
dot_env_path = os.getenv(
"WHOOGLE_DOTENV_PATH",
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../whoogle.env"))
# Load .env file if enabled
if os.path.exists(dot_env_path):
load_dotenv(dot_env_path)
app.enc_key = generate_key()
if read_config_bool('HTTPS_ONLY'):
app.config['SESSION_COOKIE_NAME'] = '__Secure-session'
app.config['SESSION_COOKIE_SECURE'] = True
app.config['VERSION_NUMBER'] = __version__
app.config['APP_ROOT'] = os.getenv(
'APP_ROOT',
os.path.dirname(os.path.abspath(__file__)))
app.config['STATIC_FOLDER'] = os.getenv(
'STATIC_FOLDER',
os.path.join(app.config['APP_ROOT'], 'static'))
app.config['BUILD_FOLDER'] = os.path.join(
app.config['STATIC_FOLDER'], 'build')
app.config['CACHE_BUSTING_MAP'] = {}
app.config['BUNDLE_STATIC'] = read_config_bool('WHOOGLE_BUNDLE_STATIC')
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/languages.json'), 'r', encoding='utf-8') as f:
app.config['LANGUAGES'] = json.load(f)
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/countries.json'), 'r', encoding='utf-8') as f:
app.config['COUNTRIES'] = json.load(f)
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/time_periods.json'), 'r', encoding='utf-8') as f:
app.config['TIME_PERIODS'] = json.load(f)
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/translations.json'), 'r', encoding='utf-8') as f:
app.config['TRANSLATIONS'] = json.load(f)
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/themes.json'), 'r', encoding='utf-8') as f:
app.config['THEMES'] = json.load(f)
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/header_tabs.json'), 'r', encoding='utf-8') as f:
app.config['HEADER_TABS'] = json.load(f)
app.config['CONFIG_PATH'] = os.getenv(
'CONFIG_VOLUME',
os.path.join(app.config['STATIC_FOLDER'], 'config'))
app.config['DEFAULT_CONFIG'] = os.path.join(
app.config['CONFIG_PATH'],
'config.json')
app.config['CONFIG_DISABLE'] = read_config_bool('WHOOGLE_CONFIG_DISABLE')
app.config['SESSION_FILE_DIR'] = os.path.join(
app.config['CONFIG_PATH'],
'session')
app.config['MAX_SESSION_SIZE'] = 4000 # Sessions won't exceed 4KB
app.config['BANG_PATH'] = os.getenv(
'CONFIG_VOLUME',
os.path.join(app.config['STATIC_FOLDER'], 'bangs'))
app.config['BANG_FILE'] = os.path.join(
app.config['BANG_PATH'],
'bangs.json')
# Global services registry (simple DI)
app.services = {}
@app.teardown_appcontext
def _teardown_clients(exception):
try:
close_all_clients()
except Exception:
pass
# Ensure all necessary directories exist
if not os.path.exists(app.config['CONFIG_PATH']):
os.makedirs(app.config['CONFIG_PATH'])
if not os.path.exists(app.config['SESSION_FILE_DIR']):
os.makedirs(app.config['SESSION_FILE_DIR'])
if not os.path.exists(app.config['BANG_PATH']):
os.makedirs(app.config['BANG_PATH'])
if not os.path.exists(app.config['BUILD_FOLDER']):
os.makedirs(app.config['BUILD_FOLDER'])
# Initialize User Agent pool
app.config['UA_CACHE_PATH'] = os.path.join(app.config['CONFIG_PATH'], 'ua_cache.json')
try:
app.config['UA_POOL'] = load_ua_pool(app.config['UA_CACHE_PATH'], count=10)
except Exception as e:
# If UA pool loading fails, log warning and set empty pool
# The gen_user_agent function will handle the fallback
print(f"Warning: Could not initialize UA pool: {e}")
app.config['UA_POOL'] = []
# Session values
app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key')
if os.path.exists(app_key_path):
try:
with open(app_key_path, 'r', encoding='utf-8') as f:
app.config['SECRET_KEY'] = f.read()
except PermissionError:
app.config['SECRET_KEY'] = str(b64encode(os.urandom(32)))
else:
app.config['SECRET_KEY'] = str(b64encode(os.urandom(32)))
with open(app_key_path, 'w', encoding='utf-8') as key_file:
key_file.write(app.config['SECRET_KEY'])
app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(days=365)
# NOTE: SESSION_COOKIE_SAMESITE must be set to 'lax' to allow the user's
# previous session to persist when accessing the instance from an external
# link. Setting this value to 'strict' causes Whoogle to revalidate a new
# session, and fail, resulting in cookies being disabled.
app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
# Config fields that are used to check for updates
app.config['RELEASES_URL'] = 'https://github.com/' \
'benbusby/whoogle-search/releases'
app.config['LAST_UPDATE_CHECK'] = datetime.now() - timedelta(hours=24)
app.config['HAS_UPDATE'] = ''
# The alternative to Google Translate is treated a bit differently than other
# social media site alternatives, in that it is used for any translation
# related searches.
translate_url = os.getenv('WHOOGLE_ALT_TL', 'https://farside.link/lingva')
if not translate_url.startswith('http'):
translate_url = 'https://' + translate_url
app.config['TRANSLATE_URL'] = translate_url
app.config['CSP'] = 'default-src \'none\';' \
'frame-src ' + translate_url + ';' \
'manifest-src \'self\';' \
'img-src \'self\' data:;' \
'style-src \'self\' \'unsafe-inline\';' \
'script-src \'self\';' \
'media-src \'self\';' \
'connect-src \'self\';'
# Generate DDG bang filter
generating_bangs = False
if not os.path.exists(app.config['BANG_FILE']):
generating_bangs = True
with open(app.config['BANG_FILE'], 'w', encoding='utf-8') as f:
json.dump({}, f)
bangs_thread = threading.Thread(
target=gen_bangs_json,
args=(app.config['BANG_FILE'],))
bangs_thread.start()
# Build new mapping of static files for cache busting
cache_busting_dirs = ['css', 'js']
for cb_dir in cache_busting_dirs:
full_cb_dir = os.path.join(app.config['STATIC_FOLDER'], cb_dir)
for cb_file in os.listdir(full_cb_dir):
# Create hash from current file state
full_cb_path = os.path.join(full_cb_dir, cb_file)
cb_file_link = gen_file_hash(full_cb_dir, cb_file)
build_path = os.path.join(app.config['BUILD_FOLDER'], cb_file_link)
try:
os.symlink(full_cb_path, build_path)
except FileExistsError:
# Symlink hasn't changed, ignore
pass
# Create mapping for relative path urls
map_path = build_path.replace(app.config['APP_ROOT'], '')
if map_path.startswith('/'):
map_path = map_path[1:]
app.config['CACHE_BUSTING_MAP'][cb_file] = map_path
# Optionally create simple bundled assets (opt-in via WHOOGLE_BUNDLE_STATIC=1)
if app.config['BUNDLE_STATIC']:
# CSS bundle: include all css except theme files (end with -theme.css)
css_dir = os.path.join(app.config['STATIC_FOLDER'], 'css')
css_parts = []
for name in sorted(os.listdir(css_dir)):
if not name.endswith('.css'):
continue
if name.endswith('-theme.css'):
continue
try:
with open(os.path.join(css_dir, name), 'r', encoding='utf-8') as f:
css_parts.append(f.read())
except Exception:
pass
css_bundle = '\n'.join(css_parts)
if css_bundle:
css_tmp = os.path.join(app.config['BUILD_FOLDER'], 'app.css')
with open(css_tmp, 'w', encoding='utf-8') as f:
f.write(css_bundle)
css_hashed = gen_file_hash(app.config['BUILD_FOLDER'], 'app.css')
os.replace(css_tmp, os.path.join(app.config['BUILD_FOLDER'], css_hashed))
map_path = os.path.join('app/static/build', css_hashed)
app.config['CACHE_BUSTING_MAP']['bundle.css'] = map_path
# JS bundle: include all js files
js_dir = os.path.join(app.config['STATIC_FOLDER'], 'js')
js_parts = []
for name in sorted(os.listdir(js_dir)):
if not name.endswith('.js'):
continue
try:
with open(os.path.join(js_dir, name), 'r', encoding='utf-8') as f:
js_parts.append(f.read())
except Exception:
pass
js_bundle = '\n;'.join(js_parts)
if js_bundle:
js_tmp = os.path.join(app.config['BUILD_FOLDER'], 'app.js')
with open(js_tmp, 'w', encoding='utf-8') as f:
f.write(js_bundle)
js_hashed = gen_file_hash(app.config['BUILD_FOLDER'], 'app.js')
os.replace(js_tmp, os.path.join(app.config['BUILD_FOLDER'], js_hashed))
map_path = os.path.join('app/static/build', js_hashed)
app.config['CACHE_BUSTING_MAP']['bundle.js'] = map_path
# Templating functions
app.jinja_env.globals.update(clean_query=clean_query)
app.jinja_env.globals.update(
cb_url=lambda f: app.config['CACHE_BUSTING_MAP'][f.lower()])
app.jinja_env.globals.update(
bundle_static=lambda: app.config.get('BUNDLE_STATIC', False))
# Attempt to acquire tor identity, to determine if Tor config is available
send_tor_signal(Signal.HEARTBEAT)
# Suppress spurious warnings from BeautifulSoup
warnings.simplefilter('ignore', MarkupResemblesLocatorWarning)
from app import routes # noqa
# The gen_bangs_json function takes care of loading bangs, so skip it here if
# it's already being loaded
if not generating_bangs:
load_all_bangs(app.config['BANG_FILE'])
# Disable logging from imported modules
logging.config.dictConfig({
'version': 1,
'disable_existing_loggers': True,
})