caddy/modules/caddyhttp/reverseproxy/passive_health_test.go
Francis Lavoie db2986028f
reverseproxy: Track dynamic upstreams, enable passive healthchecking (#7539)
* reverseproxy: Track dynamic upstreams, enable passive healthchecking

* Add tests for dynamic upstream tracking, admin endpoint, health checks
2026-03-04 15:05:26 -05:00

392 lines
12 KiB
Go

// Copyright 2015 Matthew Holt and The Caddy Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reverseproxy
import (
"context"
"testing"
"time"
"github.com/caddyserver/caddy/v2"
)
// newPassiveHandler builds a minimal Handler with passive health checks
// configured and a live caddy.Context so the fail-forgetter goroutine can
// be cancelled cleanly. The caller must call cancel() when done.
func newPassiveHandler(t *testing.T, maxFails int, failDuration time.Duration) (*Handler, context.CancelFunc) {
	t.Helper()
	ctx, cancel := caddy.NewContext(caddy.Context{Context: context.Background()})
	passive := &PassiveHealthChecks{
		MaxFails:     maxFails,
		FailDuration: caddy.Duration(failDuration),
	}
	handler := &Handler{
		ctx:          ctx,
		HealthChecks: &HealthChecks{Passive: passive},
	}
	return handler, cancel
}
// provisionedStaticUpstream creates a static upstream, registers it in the
// UsagePool, and returns a cleanup func that removes it from the pool.
func provisionedStaticUpstream(t *testing.T, h *Handler, addr string) (*Upstream, func()) {
	t.Helper()
	upstream := &Upstream{Dial: addr}
	h.provisionUpstream(upstream, false)
	cleanup := func() {
		_, _ = hosts.Delete(addr)
	}
	return upstream, cleanup
}
// provisionedDynamicUpstream creates a dynamic upstream, registers it in
// dynamicHosts, and returns a cleanup func that removes it.
func provisionedDynamicUpstream(t *testing.T, h *Handler, addr string) (*Upstream, func()) {
	t.Helper()
	upstream := &Upstream{Dial: addr}
	h.provisionUpstream(upstream, true)
	cleanup := func() {
		dynamicHostsMu.Lock()
		defer dynamicHostsMu.Unlock()
		delete(dynamicHosts, addr)
	}
	return upstream, cleanup
}
// --- countFailure behaviour ---
// TestCountFailureNoopWhenNoHealthChecks verifies that countFailure is a no-op
// when HealthChecks is nil.
func TestCountFailureNoopWhenNoHealthChecks(t *testing.T) {
	resetDynamicHosts()
	handler := &Handler{}
	upstream := &Upstream{Dial: "10.1.0.1:80", Host: new(Host)}
	handler.countFailure(upstream)
	if got := upstream.Host.Fails(); got != 0 {
		t.Errorf("expected 0 fails with no HealthChecks config, got %d", got)
	}
}
// TestCountFailureNoopWhenZeroDuration verifies that countFailure is a no-op
// when FailDuration is 0 (the zero value disables passive checks).
func TestCountFailureNoopWhenZeroDuration(t *testing.T) {
	resetDynamicHosts()
	ctx, cancel := caddy.NewContext(caddy.Context{Context: context.Background()})
	defer cancel()
	handler := &Handler{
		ctx: ctx,
		HealthChecks: &HealthChecks{
			Passive: &PassiveHealthChecks{MaxFails: 1, FailDuration: 0},
		},
	}
	upstream := &Upstream{Dial: "10.1.0.2:80", Host: new(Host)}
	handler.countFailure(upstream)
	if got := upstream.Host.Fails(); got != 0 {
		t.Errorf("expected 0 fails with zero FailDuration, got %d", got)
	}
}
// TestCountFailureIncrementsCount verifies that countFailure increments the
// fail count on the upstream's Host.
func TestCountFailureIncrementsCount(t *testing.T) {
	resetDynamicHosts()
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	upstream := &Upstream{Dial: "10.1.0.3:80", Host: new(Host)}
	handler.countFailure(upstream)
	if got := upstream.Host.Fails(); got != 1 {
		t.Errorf("expected 1 fail after countFailure, got %d", got)
	}
}
// TestCountFailureDecrementsAfterDuration verifies that the fail count is
// decremented back after FailDuration elapses.
func TestCountFailureDecrementsAfterDuration(t *testing.T) {
	resetDynamicHosts()
	const failDuration = 50 * time.Millisecond
	h, cancel := newPassiveHandler(t, 2, failDuration)
	defer cancel()
	u := &Upstream{Dial: "10.1.0.4:80", Host: new(Host)}
	h.countFailure(u)
	if u.Host.Fails() != 1 {
		t.Fatalf("expected 1 fail immediately after countFailure, got %d", u.Host.Fails())
	}
	// Poll for the forgetter goroutine instead of a single fixed sleep:
	// a fixed multiple of failDuration is flaky on a loaded machine and
	// slower than necessary on a fast one. The deadline bounds the wait
	// so a hung forgetter still fails the test promptly.
	deadline := time.Now().Add(2 * time.Second)
	for u.Host.Fails() != 0 && time.Now().Before(deadline) {
		time.Sleep(failDuration / 10)
	}
	if fails := u.Host.Fails(); fails != 0 {
		t.Errorf("expected fail count to return to 0 after FailDuration, got %d", fails)
	}
}
// TestCountFailureCancelledContextForgets verifies that cancelling the handler
// context (simulating a config unload) also triggers the forgetter to run,
// decrementing the fail count.
func TestCountFailureCancelledContextForgets(t *testing.T) {
	resetDynamicHosts()
	h, cancel := newPassiveHandler(t, 2, time.Hour) // very long duration
	u := &Upstream{Dial: "10.1.0.5:80", Host: new(Host)}
	h.countFailure(u)
	if u.Host.Fails() != 1 {
		t.Fatalf("expected 1 fail immediately after countFailure, got %d", u.Host.Fails())
	}
	// Cancelling the context should cause the forgetter goroutine to exit and
	// decrement the count. Poll with a deadline rather than one fixed 50ms
	// sleep: goroutine scheduling after cancel is not bounded, so the fixed
	// sleep was a race that could flake under load.
	cancel()
	deadline := time.Now().Add(2 * time.Second)
	for u.Host.Fails() != 0 && time.Now().Before(deadline) {
		time.Sleep(time.Millisecond)
	}
	if fails := u.Host.Fails(); fails != 0 {
		t.Errorf("expected fail count to be decremented after context cancel, got %d", fails)
	}
}
// --- static upstream passive health check ---
// TestStaticUpstreamHealthyWithNoFailures verifies that a static upstream with
// no recorded failures is considered healthy.
func TestStaticUpstreamHealthyWithNoFailures(t *testing.T) {
	resetDynamicHosts()
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	upstream, cleanup := provisionedStaticUpstream(t, handler, "10.2.0.1:80")
	defer cleanup()
	if healthy := upstream.Healthy(); !healthy {
		t.Error("upstream with no failures should be healthy")
	}
}
// TestStaticUpstreamUnhealthyAtMaxFails verifies that a static upstream is
// marked unhealthy once its fail count reaches MaxFails.
func TestStaticUpstreamUnhealthyAtMaxFails(t *testing.T) {
	resetDynamicHosts()
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	upstream, cleanup := provisionedStaticUpstream(t, handler, "10.2.0.2:80")
	defer cleanup()
	// First failure: still below MaxFails=2.
	handler.countFailure(upstream)
	if !upstream.Healthy() {
		t.Error("upstream should still be healthy after 1 of 2 allowed failures")
	}
	// Second failure reaches the threshold.
	handler.countFailure(upstream)
	if upstream.Healthy() {
		t.Error("upstream should be unhealthy after reaching MaxFails=2")
	}
}
// TestStaticUpstreamRecoversAfterFailDuration verifies that a static upstream
// returns to healthy once its failures expire.
func TestStaticUpstreamRecoversAfterFailDuration(t *testing.T) {
	resetDynamicHosts()
	const failDuration = 50 * time.Millisecond
	h, cancel := newPassiveHandler(t, 1, failDuration)
	defer cancel()
	u, cleanup := provisionedStaticUpstream(t, h, "10.2.0.3:80")
	defer cleanup()
	h.countFailure(u)
	if u.Healthy() {
		t.Fatal("upstream should be unhealthy immediately after MaxFails failure")
	}
	// Poll for recovery with a deadline instead of a single fixed sleep;
	// a fixed multiple of failDuration is flaky under load and needlessly
	// slow otherwise.
	deadline := time.Now().Add(2 * time.Second)
	for !u.Healthy() && time.Now().Before(deadline) {
		time.Sleep(failDuration / 10)
	}
	if !u.Healthy() {
		t.Errorf("upstream should recover to healthy after FailDuration, Fails=%d", u.Host.Fails())
	}
}
// TestStaticUpstreamHealthPersistedAcrossReprovisioning verifies that static
// upstreams share a Host via the UsagePool, so a second call to provisionUpstream
// for the same address (as happens on config reload) sees the accumulated state.
func TestStaticUpstreamHealthPersistedAcrossReprovisioning(t *testing.T) {
	resetDynamicHosts()
	const addr = "10.2.0.4:80"
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	first, cleanupFirst := provisionedStaticUpstream(t, handler, addr)
	defer cleanupFirst()
	handler.countFailure(first)
	handler.countFailure(first)
	// Simulate a second handler instance referencing the same upstream
	// (e.g. after a config reload that keeps the same backend address).
	second, cleanupSecond := provisionedStaticUpstream(t, handler, addr)
	defer cleanupSecond()
	if first.Host != second.Host {
		t.Fatal("expected both Upstream structs to share the same *Host via UsagePool")
	}
	if second.Healthy() {
		t.Error("re-provisioned upstream should still see the prior fail count and be unhealthy")
	}
}
// --- dynamic upstream passive health check ---
// TestDynamicUpstreamHealthyWithNoFailures verifies that a freshly provisioned
// dynamic upstream is healthy.
func TestDynamicUpstreamHealthyWithNoFailures(t *testing.T) {
	resetDynamicHosts()
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	upstream, cleanup := provisionedDynamicUpstream(t, handler, "10.3.0.1:80")
	defer cleanup()
	if healthy := upstream.Healthy(); !healthy {
		t.Error("dynamic upstream with no failures should be healthy")
	}
}
// TestDynamicUpstreamUnhealthyAtMaxFails verifies that a dynamic upstream is
// marked unhealthy once its fail count reaches MaxFails.
func TestDynamicUpstreamUnhealthyAtMaxFails(t *testing.T) {
	resetDynamicHosts()
	handler, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	upstream, cleanup := provisionedDynamicUpstream(t, handler, "10.3.0.2:80")
	defer cleanup()
	// One failure is still within the MaxFails=2 budget.
	handler.countFailure(upstream)
	if !upstream.Healthy() {
		t.Error("dynamic upstream should still be healthy after 1 of 2 allowed failures")
	}
	// The second failure reaches the threshold.
	handler.countFailure(upstream)
	if upstream.Healthy() {
		t.Error("dynamic upstream should be unhealthy after reaching MaxFails=2")
	}
}
// TestDynamicUpstreamFailCountPersistedBetweenRequests is the core regression
// test: it simulates two sequential (non-concurrent) requests to the same
// dynamic upstream. Before the fix, the UsagePool entry would be deleted
// between requests, wiping the fail count. Now it should survive.
func TestDynamicUpstreamFailCountPersistedBetweenRequests(t *testing.T) {
	resetDynamicHosts()
	const addr = "10.3.0.3:80"
	h, cancel := newPassiveHandler(t, 2, time.Minute)
	defer cancel()
	// Register cleanup up front via t.Cleanup: the original trailing manual
	// cleanup was skipped whenever an intermediate t.Fatal aborted the test,
	// leaking the dynamicHosts entry into later tests.
	t.Cleanup(func() {
		dynamicHostsMu.Lock()
		delete(dynamicHosts, addr)
		dynamicHostsMu.Unlock()
	})
	// --- first request ---
	u1 := &Upstream{Dial: addr}
	h.provisionUpstream(u1, true)
	h.countFailure(u1)
	if u1.Host.Fails() != 1 {
		t.Fatalf("expected 1 fail after first request, got %d", u1.Host.Fails())
	}
	// Simulate end of first request: no delete from any pool (key difference
	// vs. the old behaviour where hosts.Delete was deferred).
	// --- second request: brand-new *Upstream struct, same dial address ---
	u2 := &Upstream{Dial: addr}
	h.provisionUpstream(u2, true)
	if u1.Host != u2.Host {
		t.Fatal("expected both requests to share the same *Host pointer from dynamicHosts")
	}
	if u2.Host.Fails() != 1 {
		t.Errorf("expected fail count to persist across requests, got %d", u2.Host.Fails())
	}
	// A second failure now tips it over MaxFails=2.
	h.countFailure(u2)
	if u2.Healthy() {
		t.Error("upstream should be unhealthy after accumulated failures across requests")
	}
}
// TestDynamicUpstreamRecoveryAfterFailDuration verifies that a dynamic
// upstream's fail count expires and it returns to healthy.
func TestDynamicUpstreamRecoveryAfterFailDuration(t *testing.T) {
	resetDynamicHosts()
	const failDuration = 50 * time.Millisecond
	h, cancel := newPassiveHandler(t, 1, failDuration)
	defer cancel()
	u, cleanup := provisionedDynamicUpstream(t, h, "10.3.0.4:80")
	defer cleanup()
	h.countFailure(u)
	if u.Healthy() {
		t.Fatal("upstream should be unhealthy immediately after MaxFails failure")
	}
	// Poll the Host for expiry with a deadline instead of one fixed sleep;
	// the forgetter goroutine's timing is not exact, so a fixed multiple of
	// failDuration is either flaky or unnecessarily slow.
	deadline := time.Now().Add(2 * time.Second)
	for u.Host.Fails() != 0 && time.Now().Before(deadline) {
		time.Sleep(failDuration / 10)
	}
	// Re-provision (as a new request would) to get fresh *Upstream with policy set.
	u2 := &Upstream{Dial: "10.3.0.4:80"}
	h.provisionUpstream(u2, true)
	if !u2.Healthy() {
		t.Errorf("dynamic upstream should recover to healthy after FailDuration, Fails=%d", u2.Host.Fails())
	}
}
// TestDynamicUpstreamMaxRequestsFromUnhealthyRequestCount verifies that
// UnhealthyRequestCount is copied into MaxRequests so Full() works correctly.
func TestDynamicUpstreamMaxRequestsFromUnhealthyRequestCount(t *testing.T) {
	resetDynamicHosts()
	ctx, cancel := caddy.NewContext(caddy.Context{Context: context.Background()})
	defer cancel()
	handler := &Handler{
		ctx: ctx,
		HealthChecks: &HealthChecks{
			Passive: &PassiveHealthChecks{UnhealthyRequestCount: 3},
		},
	}
	upstream, cleanup := provisionedDynamicUpstream(t, handler, "10.3.0.5:80")
	defer cleanup()
	if got := upstream.MaxRequests; got != 3 {
		t.Errorf("expected MaxRequests=3 from UnhealthyRequestCount, got %d", got)
	}
	// Should not be full with fewer requests than the limit.
	_ = upstream.Host.countRequest(2)
	if upstream.Full() {
		t.Error("upstream should not be full with 2 of 3 allowed requests")
	}
	// One more request hits the limit exactly.
	_ = upstream.Host.countRequest(1)
	if !upstream.Full() {
		t.Error("upstream should be full at UnhealthyRequestCount concurrent requests")
	}
}