logging: Implement dial timeout for net writer (fix #4083) (#4172)

* logging: Implement dial timeout for net writer (fix #4083)

* Limit how often redials are attempted

This should cause dial blocking to occur only once every 10 seconds at most, but it also means the logger connection might be down for up to 10 seconds after it comes back online; oh well. We shouldn't block for DialTimeout at every single log emission.

* Clarify offline behavior
This commit is contained in:
Matt Holt 2021-05-19 15:14:03 -06:00 committed by GitHub
parent c052162203
commit e3c369d452
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -18,7 +18,9 @@ import (
"fmt" "fmt"
"io" "io"
"net" "net"
"os"
"sync" "sync"
"time"
"github.com/caddyserver/caddy/v2" "github.com/caddyserver/caddy/v2"
"github.com/caddyserver/caddy/v2/caddyconfig/caddyfile" "github.com/caddyserver/caddy/v2/caddyconfig/caddyfile"
@ -28,10 +30,16 @@ func init() {
caddy.RegisterModule(NetWriter{}) caddy.RegisterModule(NetWriter{})
} }
// NetWriter implements a log writer that outputs to a network socket. // NetWriter implements a log writer that outputs to a network socket. If
// the socket goes down, it will dump logs to stderr while it attempts to
// reconnect.
type NetWriter struct { type NetWriter struct {
// The address of the network socket to which to connect.
Address string `json:"address,omitempty"` Address string `json:"address,omitempty"`
// The timeout to wait while connecting to the socket.
DialTimeout caddy.Duration `json:"dial_timeout,omitempty"`
addr caddy.NetworkAddress addr caddy.NetworkAddress
} }
@ -60,6 +68,10 @@ func (nw *NetWriter) Provision(ctx caddy.Context) error {
return fmt.Errorf("multiple ports not supported") return fmt.Errorf("multiple ports not supported")
} }
if nw.DialTimeout < 0 {
return fmt.Errorf("timeout cannot be less than 0")
}
return nil return nil
} }
@ -74,7 +86,10 @@ func (nw NetWriter) WriterKey() string {
// OpenWriter opens a new network connection. // OpenWriter opens a new network connection.
func (nw NetWriter) OpenWriter() (io.WriteCloser, error) { func (nw NetWriter) OpenWriter() (io.WriteCloser, error) {
reconn := &redialerConn{nw: nw} reconn := &redialerConn{
nw: nw,
timeout: time.Duration(nw.DialTimeout),
}
conn, err := reconn.dial() conn, err := reconn.dial()
if err != nil { if err != nil {
return nil, err return nil, err
@ -87,7 +102,9 @@ func (nw NetWriter) OpenWriter() (io.WriteCloser, error) {
// UnmarshalCaddyfile sets up the handler from Caddyfile tokens. Syntax: // UnmarshalCaddyfile sets up the handler from Caddyfile tokens. Syntax:
// //
// net <address> // net <address> {
// dial_timeout <duration>
// }
// //
func (nw *NetWriter) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { func (nw *NetWriter) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
for d.Next() { for d.Next() {
@ -98,6 +115,22 @@ func (nw *NetWriter) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
if d.NextArg() { if d.NextArg() {
return d.ArgErr() return d.ArgErr()
} }
for nesting := d.Nesting(); d.NextBlock(nesting); {
switch d.Val() {
case "dial_timeout":
if !d.NextArg() {
return d.ArgErr()
}
timeout, err := caddy.ParseDuration(d.Val())
if err != nil {
return d.Errf("invalid duration: %s", d.Val())
}
if d.NextArg() {
return d.ArgErr()
}
nw.DialTimeout = caddy.Duration(timeout)
}
}
} }
return nil return nil
} }
@ -107,8 +140,10 @@ func (nw *NetWriter) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
// is retried. // is retried.
type redialerConn struct { type redialerConn struct {
net.Conn net.Conn
connMu sync.RWMutex connMu sync.RWMutex
nw NetWriter nw NetWriter
timeout time.Duration
lastRedial time.Time
} }
// Write wraps the underlying Conn.Write method, but if that fails, // Write wraps the underlying Conn.Write method, but if that fails,
@ -131,20 +166,32 @@ func (reconn *redialerConn) Write(b []byte) (n int, err error) {
return return
} }
// we're the lucky first goroutine to re-dial the connection // there's still a problem, so try to re-attempt dialing the socket
conn2, err2 := reconn.dial() // if some time has passed in which the issue could have potentially
if err2 != nil { // been resolved - we don't want to block at every single log
return // emission (!) - see discussion in #4111
} if time.Since(reconn.lastRedial) > 10*time.Second {
if n, err = conn2.Write(b); err == nil { reconn.lastRedial = time.Now()
reconn.Conn.Close() conn2, err2 := reconn.dial()
reconn.Conn = conn2 if err2 != nil {
// logger socket still offline; instead of discarding the log, dump it to stderr
os.Stderr.Write(b)
return
}
if n, err = conn2.Write(b); err == nil {
reconn.Conn.Close()
reconn.Conn = conn2
}
} else {
// last redial attempt was too recent; just dump to stderr for now
os.Stderr.Write(b)
} }
return return
} }
func (reconn *redialerConn) dial() (net.Conn, error) { func (reconn *redialerConn) dial() (net.Conn, error) {
return net.Dial(reconn.nw.addr.Network, reconn.nw.addr.JoinHostPort(0)) return net.DialTimeout(reconn.nw.addr.Network, reconn.nw.addr.JoinHostPort(0), reconn.timeout)
} }
// Interface guards // Interface guards