mirror of
https://github.com/caddyserver/caddy.git
synced 2026-03-07 17:45:50 -05:00
* perf: collect metrics once per route instead of per handler (#4644) Move Prometheus metrics instrumentation from the per-handler level to the per-route level. Previously, every middleware handler in a route was individually wrapped with metricsInstrumentedHandler, causing metrics to be collected N times per request (once per handler in the chain). Since all handlers in a route see the same request, these per-handler metrics were redundant and added significant CPU overhead (73% of request handling time per the original profiling). The fix introduces metricsInstrumentedRoute which wraps the entire compiled handler chain once in wrapRoute, collecting metrics only when the route actually matches. The handler label uses the first handler's module name, which is the most meaningful identifier for the route. Benchmark results (5 handlers per route): Old (per-handler): ~4650 ns/op, 4400 B/op, 45 allocs/op New (per-route): ~940 ns/op, 816 B/op, 8 allocs/op Improvement: ~5x faster, ~5.4x less memory, ~5.6x fewer allocs Signed-off-by: Varun Chawla <varun_6april@hotmail.com> * Remove unused metricsInstrumentedHandler code Delete the metricsInstrumentedHandler type, its constructor, and ServeHTTP method since they are no longer used after switching to route-level metrics collection via metricsInstrumentedRoute. Also remove the unused metrics parameter from wrapMiddleware and the middlewareHandlerFunc test helper, and convert existing tests to use the new route-level API. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Address review feedback: restore comments, move function to bottom - Move computeApproximateRequestSize back to bottom of file to minimize diff - Restore all useful comments that were accidentally dropped - Old metricsInstrumentedHandler already removed in previous commit --------- Signed-off-by: Varun Chawla <varun_6april@hotmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
331 lines
11 KiB
Go
331 lines
11 KiB
Go
package caddyhttp
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
|
|
"github.com/caddyserver/caddy/v2"
|
|
"github.com/caddyserver/caddy/v2/internal/metrics"
|
|
)
|
|
|
|
// Metrics configures metrics observations.
|
|
// EXPERIMENTAL and subject to change or removal.
|
|
//
|
|
// Example configuration:
|
|
//
|
|
// {
|
|
// "apps": {
|
|
// "http": {
|
|
// "metrics": {
|
|
// "per_host": true,
|
|
// "observe_catchall_hosts": false
|
|
// },
|
|
// "servers": {
|
|
// "srv0": {
|
|
// "routes": [{
|
|
// "match": [{"host": ["example.com", "www.example.com"]}],
|
|
// "handle": [{"handler": "static_response", "body": "Hello"}]
|
|
// }]
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
//
|
|
// In this configuration:
|
|
// - Requests to example.com and www.example.com get individual host labels
|
|
// - All other hosts (e.g., attacker.com) are aggregated under "_other" label
|
|
// - This prevents unlimited cardinality from arbitrary Host headers
|
|
type Metrics struct {
|
|
// Enable per-host metrics. Enabling this option may
|
|
// incur high-memory consumption, depending on the number of hosts
|
|
// managed by Caddy.
|
|
//
|
|
// CARDINALITY PROTECTION: To prevent unbounded cardinality attacks,
|
|
// only explicitly configured hosts (via host matchers) are allowed
|
|
// by default. Other hosts are aggregated under the "_other" label.
|
|
// See AllowCatchAllHosts to change this behavior.
|
|
PerHost bool `json:"per_host,omitempty"`
|
|
|
|
// Allow metrics for catch-all hosts (hosts without explicit configuration).
|
|
// When false (default), only hosts explicitly configured via host matchers
|
|
// will get individual metrics labels. All other hosts will be aggregated
|
|
// under the "_other" label to prevent cardinality explosion.
|
|
//
|
|
// This is automatically enabled for HTTPS servers (since certificates provide
|
|
// some protection against unbounded cardinality), but disabled for HTTP servers
|
|
// by default to prevent cardinality attacks from arbitrary Host headers.
|
|
//
|
|
// Set to true to allow all hosts to get individual metrics (NOT RECOMMENDED
|
|
// for production environments exposed to the internet).
|
|
ObserveCatchallHosts bool `json:"observe_catchall_hosts,omitempty"`
|
|
|
|
init sync.Once
|
|
httpMetrics *httpMetrics
|
|
allowedHosts map[string]struct{}
|
|
hasHTTPSServer bool
|
|
}
|
|
|
|
type httpMetrics struct {
|
|
requestInFlight *prometheus.GaugeVec
|
|
requestCount *prometheus.CounterVec
|
|
requestErrors *prometheus.CounterVec
|
|
requestDuration *prometheus.HistogramVec
|
|
requestSize *prometheus.HistogramVec
|
|
responseSize *prometheus.HistogramVec
|
|
responseDuration *prometheus.HistogramVec
|
|
}
|
|
|
|
func initHTTPMetrics(ctx caddy.Context, metrics *Metrics) {
|
|
const ns, sub = "caddy", "http"
|
|
registry := ctx.GetMetricsRegistry()
|
|
basicLabels := []string{"server", "handler"}
|
|
if metrics.PerHost {
|
|
basicLabels = append(basicLabels, "host")
|
|
}
|
|
metrics.httpMetrics.requestInFlight = promauto.With(registry).NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "requests_in_flight",
|
|
Help: "Number of requests currently handled by this server.",
|
|
}, basicLabels)
|
|
metrics.httpMetrics.requestErrors = promauto.With(registry).NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "request_errors_total",
|
|
Help: "Number of requests resulting in middleware errors.",
|
|
}, basicLabels)
|
|
metrics.httpMetrics.requestCount = promauto.With(registry).NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "requests_total",
|
|
Help: "Counter of HTTP(S) requests made.",
|
|
}, basicLabels)
|
|
|
|
// TODO: allow these to be customized in the config
|
|
durationBuckets := prometheus.DefBuckets
|
|
sizeBuckets := prometheus.ExponentialBuckets(256, 4, 8)
|
|
|
|
httpLabels := []string{"server", "handler", "code", "method"}
|
|
if metrics.PerHost {
|
|
httpLabels = append(httpLabels, "host")
|
|
}
|
|
metrics.httpMetrics.requestDuration = promauto.With(registry).NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "request_duration_seconds",
|
|
Help: "Histogram of round-trip request durations.",
|
|
Buckets: durationBuckets,
|
|
}, httpLabels)
|
|
metrics.httpMetrics.requestSize = promauto.With(registry).NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "request_size_bytes",
|
|
Help: "Total size of the request. Includes body",
|
|
Buckets: sizeBuckets,
|
|
}, httpLabels)
|
|
metrics.httpMetrics.responseSize = promauto.With(registry).NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "response_size_bytes",
|
|
Help: "Size of the returned response.",
|
|
Buckets: sizeBuckets,
|
|
}, httpLabels)
|
|
metrics.httpMetrics.responseDuration = promauto.With(registry).NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: ns,
|
|
Subsystem: sub,
|
|
Name: "response_duration_seconds",
|
|
Help: "Histogram of times to first byte in response bodies.",
|
|
Buckets: durationBuckets,
|
|
}, httpLabels)
|
|
}
|
|
|
|
// scanConfigForHosts scans the HTTP app configuration to build a set of allowed hosts
|
|
// for metrics collection, similar to how auto-HTTPS scans for domain names.
|
|
func (m *Metrics) scanConfigForHosts(app *App) {
|
|
if !m.PerHost {
|
|
return
|
|
}
|
|
|
|
m.allowedHosts = make(map[string]struct{})
|
|
m.hasHTTPSServer = false
|
|
|
|
for _, srv := range app.Servers {
|
|
// Check if this server has TLS enabled
|
|
serverHasTLS := len(srv.TLSConnPolicies) > 0
|
|
if serverHasTLS {
|
|
m.hasHTTPSServer = true
|
|
}
|
|
|
|
// Collect hosts from route matchers
|
|
for _, route := range srv.Routes {
|
|
for _, matcherSet := range route.MatcherSets {
|
|
for _, matcher := range matcherSet {
|
|
if hm, ok := matcher.(*MatchHost); ok {
|
|
for _, host := range *hm {
|
|
// Only allow non-fuzzy hosts to prevent unbounded cardinality
|
|
if !hm.fuzzy(host) {
|
|
m.allowedHosts[strings.ToLower(host)] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// shouldAllowHostMetrics determines if metrics should be collected for the given host.
|
|
// This implements the cardinality protection by only allowing metrics for:
|
|
// 1. Explicitly configured hosts
|
|
// 2. Catch-all requests on HTTPS servers (if AllowCatchAllHosts is true or auto-enabled)
|
|
// 3. Catch-all requests on HTTP servers only if explicitly allowed
|
|
func (m *Metrics) shouldAllowHostMetrics(host string, isHTTPS bool) bool {
|
|
if !m.PerHost {
|
|
return true // host won't be used in labels anyway
|
|
}
|
|
|
|
normalizedHost := strings.ToLower(host)
|
|
|
|
// Always allow explicitly configured hosts
|
|
if _, exists := m.allowedHosts[normalizedHost]; exists {
|
|
return true
|
|
}
|
|
|
|
// For catch-all requests (not in allowed hosts)
|
|
allowCatchAll := m.ObserveCatchallHosts || (isHTTPS && m.hasHTTPSServer)
|
|
return allowCatchAll
|
|
}
|
|
|
|
// serverNameFromContext extracts the current server name from the context.
|
|
// Returns "UNKNOWN" if none is available (should probably never happen).
|
|
func serverNameFromContext(ctx context.Context) string {
|
|
srv, ok := ctx.Value(ServerCtxKey).(*Server)
|
|
if !ok || srv == nil || srv.name == "" {
|
|
return "UNKNOWN"
|
|
}
|
|
return srv.name
|
|
}
|
|
|
|
// metricsInstrumentedRoute wraps a compiled route Handler with metrics
|
|
// instrumentation. It wraps the entire compiled route chain once,
|
|
// collecting metrics only once per route match.
|
|
type metricsInstrumentedRoute struct {
|
|
handler string
|
|
next Handler
|
|
metrics *Metrics
|
|
}
|
|
|
|
func newMetricsInstrumentedRoute(ctx caddy.Context, handler string, next Handler, m *Metrics) *metricsInstrumentedRoute {
|
|
m.init.Do(func() {
|
|
initHTTPMetrics(ctx, m)
|
|
})
|
|
|
|
return &metricsInstrumentedRoute{handler: handler, next: next, metrics: m}
|
|
}
|
|
|
|
func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Request) error {
|
|
server := serverNameFromContext(r.Context())
|
|
labels := prometheus.Labels{"server": server, "handler": h.handler}
|
|
method := metrics.SanitizeMethod(r.Method)
|
|
// the "code" value is set later, but initialized here to eliminate the possibility
|
|
// of a panic
|
|
statusLabels := prometheus.Labels{"server": server, "handler": h.handler, "method": method, "code": ""}
|
|
|
|
// Determine if this is an HTTPS request
|
|
isHTTPS := r.TLS != nil
|
|
|
|
if h.metrics.PerHost {
|
|
// Apply cardinality protection for host metrics
|
|
if h.metrics.shouldAllowHostMetrics(r.Host, isHTTPS) {
|
|
labels["host"] = strings.ToLower(r.Host)
|
|
statusLabels["host"] = strings.ToLower(r.Host)
|
|
} else {
|
|
// Use a catch-all label for unallowed hosts to prevent cardinality explosion
|
|
labels["host"] = "_other"
|
|
statusLabels["host"] = "_other"
|
|
}
|
|
}
|
|
|
|
inFlight := h.metrics.httpMetrics.requestInFlight.With(labels)
|
|
inFlight.Inc()
|
|
defer inFlight.Dec()
|
|
|
|
start := time.Now()
|
|
|
|
// This is a _bit_ of a hack - it depends on the ShouldBufferFunc always
|
|
// being called when the headers are written.
|
|
// Effectively the same behaviour as promhttp.InstrumentHandlerTimeToWriteHeader.
|
|
writeHeaderRecorder := ShouldBufferFunc(func(status int, header http.Header) bool {
|
|
statusLabels["code"] = metrics.SanitizeCode(status)
|
|
ttfb := time.Since(start).Seconds()
|
|
h.metrics.httpMetrics.responseDuration.With(statusLabels).Observe(ttfb)
|
|
return false
|
|
})
|
|
wrec := NewResponseRecorder(w, nil, writeHeaderRecorder)
|
|
err := h.next.ServeHTTP(wrec, r)
|
|
dur := time.Since(start).Seconds()
|
|
h.metrics.httpMetrics.requestCount.With(labels).Inc()
|
|
|
|
observeRequest := func(status int) {
|
|
// If the code hasn't been set yet, and we didn't encounter an error, we're
|
|
// probably falling through with an empty handler.
|
|
if statusLabels["code"] == "" {
|
|
// we still sanitize it, even though it's likely to be 0. A 200 is
|
|
// returned on fallthrough so we want to reflect that.
|
|
statusLabels["code"] = metrics.SanitizeCode(status)
|
|
}
|
|
|
|
h.metrics.httpMetrics.requestDuration.With(statusLabels).Observe(dur)
|
|
h.metrics.httpMetrics.requestSize.With(statusLabels).Observe(float64(computeApproximateRequestSize(r)))
|
|
h.metrics.httpMetrics.responseSize.With(statusLabels).Observe(float64(wrec.Size()))
|
|
}
|
|
|
|
if err != nil {
|
|
var handlerErr HandlerError
|
|
if errors.As(err, &handlerErr) {
|
|
observeRequest(handlerErr.StatusCode)
|
|
}
|
|
|
|
h.metrics.httpMetrics.requestErrors.With(labels).Inc()
|
|
|
|
return err
|
|
}
|
|
|
|
observeRequest(wrec.Status())
|
|
|
|
return nil
|
|
}
|
|
|
|
// computeApproximateRequestSize estimates the size of r in bytes by summing
// the lengths of the URL string (when r.URL is set), method, protocol, host,
// every header name and value, and the declared content length.
//
// A negative ContentLength contributes nothing: net/http uses -1 for
// "unknown", and any other negative value would otherwise shrink the result
// and could make it negative.
//
// Adapted from https://github.com/prometheus/client_golang/blob/6007b2b5cae01203111de55f753e76d8dac1f529/prometheus/promhttp/instrument_server.go#L298
func computeApproximateRequestSize(r *http.Request) int {
	size := len(r.Method) + len(r.Proto) + len(r.Host)

	if r.URL != nil {
		size += len(r.URL.String())
	}

	for name, values := range r.Header {
		size += len(name)
		for _, value := range values {
			size += len(value)
		}
	}

	// N.B. r.Form and r.MultipartForm are assumed to be included in r.URL.

	if r.ContentLength > 0 {
		size += int(r.ContentLength)
	}
	return size
}
|