Only restart sub-processes which need restarting on bootstrap changes

Brian Picciano 2024-07-19 20:49:04 +02:00
parent bc9a2b62ef
commit 7aa11ebe29
7 changed files with 240 additions and 131 deletions
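In short: instead of tearing down and recreating every child process whenever the bootstrap changes, the daemon now computes a per-subsystem diff and restarts only dnsmasq and/or nebula as needed. A minimal sketch of the resulting control flow, using the functions added below (this exact driver helper does not appear in the commit):

	func applyBootstrapDiff(
		c *Children, daemonConfig Config, prev, next bootstrap.Bootstrap,
	) error {
		// Compare the DNS and nebula configs derived from both bootstraps.
		diff, err := calcBootstrapDiff(daemonConfig, prev, next)
		if err != nil {
			return fmt.Errorf("calculating bootstrap diff: %w", err)
		}

		var errs []error
		if diff.dnsChanged {
			// Rewrites dnsmasq.conf, then restarts only the dnsmasq child.
			errs = append(errs, c.RestartDNSMasq(next))
		}
		if diff.nebulaChanged {
			// Rewrites nebula.yml, then restarts only the nebula child.
			errs = append(errs, c.RestartNebula(next))
		}
		return errors.Join(errs...) // nil entries are dropped by Join
	}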

View File

@@ -1,10 +1,13 @@
 package daemon

 import (
+	"bytes"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
+	"reflect"

 	"isle/bootstrap"
 	"isle/garage/garagesrv"
@@ -77,3 +80,49 @@ func coalesceDaemonConfigAndBootstrap(
 	return hostBootstrap, nil
 }
+
+type bootstrapDiff struct {
+	hostsChanged  bool
+	nebulaChanged bool
+	dnsChanged    bool
+}
+
+func calcBootstrapDiff(
+	daemonConfig Config,
+	prevBootstrap, nextBootstrap bootstrap.Bootstrap,
+) (
+	diff bootstrapDiff, err error,
+) {
+	{
+		prevHash, prevErr := bootstrap.HostsHash(prevBootstrap.Hosts)
+		nextHash, nextErr := bootstrap.HostsHash(nextBootstrap.Hosts)
+		if err = errors.Join(prevErr, nextErr); err != nil {
+			err = fmt.Errorf("calculating host hashes: %w", err)
+			return
+		}
+		diff.hostsChanged = !bytes.Equal(prevHash, nextHash)
+	}
+
+	{
+		prevNebulaConfig, prevErr := nebulaConfig(daemonConfig, prevBootstrap)
+		nextNebulaConfig, nextErr := nebulaConfig(daemonConfig, nextBootstrap)
+		if err = errors.Join(prevErr, nextErr); err != nil {
+			err = fmt.Errorf("calculating nebula config: %w", err)
+			return
+		}
+		diff.nebulaChanged = !reflect.DeepEqual(
+			prevNebulaConfig, nextNebulaConfig,
+		)
+	}
+
+	{
+		diff.dnsChanged = !reflect.DeepEqual(
+			dnsmasqConfig(daemonConfig, prevBootstrap),
+			dnsmasqConfig(daemonConfig, nextBootstrap),
+		)
+	}
+
+	return
+}

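The diff flags above are plain bools, which is what lets reload (added later in this commit) short-circuit with `diff == (bootstrapDiff{})`. A self-contained illustration of that zero-value comparison (the struct here mirrors the one defined above):

	package main

	import "fmt"

	// All fields are comparable, so the struct itself can be compared
	// directly against its zero value.
	type bootstrapDiff struct {
		hostsChanged  bool
		nebulaChanged bool
		dnsChanged    bool
	}

	func main() {
		var diff bootstrapDiff
		fmt.Println(diff == (bootstrapDiff{})) // true: nothing changed
		diff.dnsChanged = true
		fmt.Println(diff == (bootstrapDiff{})) // false: dnsmasq needs a restart
	}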
View File

@@ -12,16 +12,9 @@ import (
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 )

-func dnsmasqPmuxProcConfig(
-	logger *mlog.Logger,
-	runtimeDirPath, binDirPath string,
-	hostBootstrap bootstrap.Bootstrap,
-	daemonConfig Config,
-) (
-	pmuxlib.ProcessConfig, error,
-) {
-	confPath := filepath.Join(runtimeDirPath, "dnsmasq.conf")
-
+func dnsmasqConfig(
+	daemonConfig Config, hostBootstrap bootstrap.Bootstrap,
+) dnsmasq.ConfData {
 	hostsSlice := make([]dnsmasq.ConfDataHost, 0, len(hostBootstrap.Hosts))
 	for _, host := range hostBootstrap.Hosts {
 		hostsSlice = append(hostsSlice, dnsmasq.ConfDataHost{
@@ -34,15 +27,48 @@ func dnsmasqPmuxProcConfig(
 		return hostsSlice[i].IP < hostsSlice[j].IP
 	})

-	confData := dnsmasq.ConfData{
+	return dnsmasq.ConfData{
 		Resolvers: daemonConfig.DNS.Resolvers,
 		Domain:    hostBootstrap.NetworkCreationParams.Domain,
 		IP:        hostBootstrap.ThisHost().IP().String(),
 		Hosts:     hostsSlice,
 	}
+}
+
+func dnsmasqWriteConfig(
+	runtimeDirPath string,
+	daemonConfig Config,
+	hostBootstrap bootstrap.Bootstrap,
+) (
+	string, error,
+) {
+	var (
+		confPath = filepath.Join(runtimeDirPath, "dnsmasq.conf")
+		confData = dnsmasqConfig(daemonConfig, hostBootstrap)
+	)

 	if err := dnsmasq.WriteConfFile(confPath, confData); err != nil {
-		return pmuxlib.ProcessConfig{}, fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
+		return "", fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
+	}
+
+	return confPath, nil
+}
+
+func dnsmasqPmuxProcConfig(
+	logger *mlog.Logger,
+	runtimeDirPath, binDirPath string,
+	daemonConfig Config,
+	hostBootstrap bootstrap.Bootstrap,
+) (
+	pmuxlib.ProcessConfig, error,
+) {
+	confPath, err := dnsmasqWriteConfig(
+		runtimeDirPath, daemonConfig, hostBootstrap,
+	)
+	if err != nil {
+		return pmuxlib.ProcessConfig{}, fmt.Errorf(
+			"writing dnsmasq config: %w", err,
+		)
 	}

 	return pmuxlib.ProcessConfig{

View File

@@ -46,14 +46,12 @@ func waitForNebula(
 	return ctx.Err()
 }

-func nebulaPmuxProcConfig(
-	runtimeDirPath, binDirPath string,
+func nebulaConfig(
 	daemonConfig Config,
 	hostBootstrap bootstrap.Bootstrap,
 ) (
-	pmuxlib.ProcessConfig, error,
+	map[string]any, error,
 ) {
 	var (
 		lighthouseHostIPs []string
 		staticHostMap     = map[string][]string{}
@@ -72,23 +70,19 @@ func nebulaPmuxProcConfig(
 	caCertPEM, err := hostBootstrap.CAPublicCredentials.Cert.Unwrap().MarshalToPEM()
 	if err != nil {
-		return pmuxlib.ProcessConfig{}, fmt.Errorf(
-			"marshaling CA cert to PEM: %w", err,
-		)
+		return nil, fmt.Errorf("marshaling CA cert to PEM: %w", err)
 	}

 	hostCertPEM, err := hostBootstrap.PublicCredentials.Cert.Unwrap().MarshalToPEM()
 	if err != nil {
-		return pmuxlib.ProcessConfig{}, fmt.Errorf(
-			"marshaling host cert to PEM: %w", err,
-		)
+		return nil, fmt.Errorf("marshaling host cert to PEM: %w", err)
 	}

 	hostKeyPEM := cert.MarshalX25519PrivateKey(
 		hostBootstrap.PrivateCredentials.EncryptingPrivateKey.Bytes(),
 	)

-	config := map[string]interface{}{
+	config := map[string]any{
 		"pki": map[string]string{
 			"ca":   string(caCertPEM),
 			"cert": string(hostCertPEM),
@@ -99,7 +93,7 @@ func nebulaPmuxProcConfig(
 			"punch":   true,
 			"respond": true,
 		},
-		"tun": map[string]interface{}{
+		"tun": map[string]any{
 			"dev": daemonConfig.VPN.Tun.Device,
 		},
 		"firewall": daemonConfig.VPN.Firewall,
@@ -112,7 +106,7 @@ func nebulaPmuxProcConfig(
 			"port": "0",
 		}

-		config["lighthouse"] = map[string]interface{}{
+		config["lighthouse"] = map[string]any{
 			"hosts": lighthouseHostIPs,
 		}
@@ -121,7 +115,9 @@ func nebulaPmuxProcConfig(
 		_, port, err := net.SplitHostPort(publicAddr)
 		if err != nil {
-			return pmuxlib.ProcessConfig{}, fmt.Errorf("parsing public address %q: %w", publicAddr, err)
+			return nil, fmt.Errorf(
+				"parsing public address %q: %w", publicAddr, err,
+			)
 		}

 		config["listen"] = map[string]string{
@@ -129,16 +125,55 @@ func nebulaPmuxProcConfig(
 			"port": port,
 		}

-		config["lighthouse"] = map[string]interface{}{
+		config["lighthouse"] = map[string]any{
 			"hosts":         []string{},
 			"am_lighthouse": true,
 		}
 	}

+	return config, nil
+}
+
+func nebulaWriteConfig(
+	runtimeDirPath string,
+	daemonConfig Config,
+	hostBootstrap bootstrap.Bootstrap,
+) (
+	string, error,
+) {
+	config, err := nebulaConfig(daemonConfig, hostBootstrap)
+	if err != nil {
+		return "", fmt.Errorf("creating nebula config: %w", err)
+	}
+
 	nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
 	if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
-		return pmuxlib.ProcessConfig{}, fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
+		return "", fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
+	}
+
+	return nebulaYmlPath, nil
+}
+
+func nebulaPmuxProcConfig(
+	runtimeDirPath, binDirPath string,
+	daemonConfig Config,
+	hostBootstrap bootstrap.Bootstrap,
+) (
+	pmuxlib.ProcessConfig, error,
+) {
+	config, err := nebulaConfig(daemonConfig, hostBootstrap)
+	if err != nil {
+		return pmuxlib.ProcessConfig{}, fmt.Errorf(
+			"creating nebula config: %w", err,
+		)
+	}
+
+	nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
+	if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
+		return pmuxlib.ProcessConfig{}, fmt.Errorf(
+			"writing nebula.yml to %q: %w", nebulaYmlPath, err,
+		)
 	}

 	return pmuxlib.ProcessConfig{

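Splitting nebulaConfig out so it returns a plain map[string]any is what makes change detection possible: two generated configs can be compared structurally before anything is written to disk. A hedged sketch of that comparison (the wrapper function here is hypothetical; nebulaConfig is the one defined above):

	func nebulaConfigChanged(
		daemonConfig Config, prev, next bootstrap.Bootstrap,
	) (bool, error) {
		prevConfig, err := nebulaConfig(daemonConfig, prev)
		if err != nil {
			return false, err
		}
		nextConfig, err := nebulaConfig(daemonConfig, next)
		if err != nil {
			return false, err
		}
		// reflect.DeepEqual recurses into the nested maps and slices, so no
		// YAML serialization is needed just to detect a change.
		return !reflect.DeepEqual(prevConfig, nextConfig), nil
	}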
View File

@@ -31,8 +31,8 @@ func (c *Children) newPmuxConfig(
 		c.logger,
 		c.opts.EnvVars.RuntimeDirPath,
 		binDirPath,
-		hostBootstrap,
 		daemonConfig,
+		hostBootstrap,
 	)
 	if err != nil {
 		return pmuxlib.Config{}, fmt.Errorf(

View File

@@ -17,8 +17,9 @@ import (
 // - dnsmasq
 // - garage (0 or more, depending on configured storage allocations)
 type Children struct {
-	logger *mlog.Logger
-	opts   Opts
+	logger       *mlog.Logger
+	daemonConfig Config
+	opts         Opts

 	pmux *pmuxlib.Pmux
 }
@@ -46,8 +47,9 @@ func NewChildren(
 	}

 	c := &Children{
-		logger: logger,
-		opts:   *opts,
+		logger:       logger,
+		daemonConfig: daemonConfig,
+		opts:         *opts,
 	}

 	pmuxConfig, err := c.newPmuxConfig(
@@ -84,6 +86,32 @@ func NewChildren(
 	return c, nil
 }

+// RestartDNSMasq rewrites the dnsmasq config and restarts the process.
+func (c *Children) RestartDNSMasq(hostBootstrap bootstrap.Bootstrap) error {
+	_, err := dnsmasqWriteConfig(
+		c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap,
+	)
+	if err != nil {
+		return fmt.Errorf("writing new dnsmasq config: %w", err)
+	}
+
+	c.pmux.Restart("dnsmasq")
+	return nil
+}
+
+// RestartNebula rewrites the nebula config and restarts the process.
+func (c *Children) RestartNebula(hostBootstrap bootstrap.Bootstrap) error {
+	_, err := nebulaWriteConfig(
+		c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap,
+	)
+	if err != nil {
+		return fmt.Errorf("writing new nebula config: %w", err)
+	}
+
+	c.pmux.Restart("nebula")
+	return nil
+}
+
 // Shutdown blocks until all child processes have gracefully shut themselves
 // down.
 //

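Both Restart* methods follow the same write-then-restart sequence: persist the new config file first, and only then ask pmux to bounce the child, so the restarted process reads the fresh file on startup. A generic sketch of that pattern (the function and parameter names here are illustrative, not from the commit):

	func restartWithNewConfig(
		name string,
		writeConfig func() (string, error), // e.g. dnsmasqWriteConfig, curried
		restart func(name string),          // e.g. (*pmuxlib.Pmux).Restart
	) error {
		if _, err := writeConfig(); err != nil {
			// Never restart on a failed write; the child would re-read a
			// stale or half-written config file.
			return fmt.Errorf("writing new %s config: %w", name, err)
		}
		restart(name) // the restarted child picks up the file written above
		return nil
	}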
View File

@@ -3,7 +3,6 @@
 package daemon

 import (
-	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -146,7 +145,6 @@ const (
 	daemonStateNoNetwork = iota
 	daemonStateInitializing
 	daemonStateOk
-	daemonStateRestarting
 	daemonStateShutdown
 )
@@ -182,10 +180,6 @@ type daemon struct {
 // While still starting up the Daemon for the first time all methods will return
 // ErrInitializing, except Shutdown which will block until initialization is
 // canceled.
-//
-// TODO make daemon smarter, it currently restarts on _any_ change, but
-// it should restart itself only when there's something actually requiring a
-// restart.
 func NewDaemon(
 	logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts,
 ) (
@@ -294,8 +288,6 @@ func withCurrBootstrap[Res any](
 		return zero, ErrInitializing
 	case daemonStateOk:
 		return fn(currBootstrap)
-	case daemonStateRestarting:
-		return zero, ErrRestarting
 	case daemonStateShutdown:
 		return zero, errors.New("already shutdown")
 	default:
@@ -303,80 +295,49 @@ func withCurrBootstrap[Res any](
 	}
 }

-// creates a new bootstrap file using available information from the network. If
-// the new bootstrap file is different than the existing one, the existing one
-// is overwritten and true is returned.
-func (d *daemon) checkBootstrap(
-	ctx context.Context, hostBootstrap bootstrap.Bootstrap,
-) (
-	bootstrap.Bootstrap, bool, error,
-) {
-	thisHost := hostBootstrap.ThisHost()
-
-	newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, hostBootstrap)
-	if err != nil {
-		return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err)
-	}
-
-	// the daemon's view of this host's bootstrap info takes precedence over
-	// whatever is in garage
-	newHosts[thisHost.Name] = thisHost
-
-	newHostsHash, err := bootstrap.HostsHash(newHosts)
-	if err != nil {
-		return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err)
-	}
-
-	currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts)
-	if err != nil {
-		return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err)
-	}
-
-	if bytes.Equal(newHostsHash, currHostsHash) {
-		return hostBootstrap, false, nil
-	}
-
-	hostBootstrap.Hosts = newHosts
-
-	return hostBootstrap, true, nil
-}
-
-// blocks until a new bootstrap is available or context is canceled
-func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap {
-	ticker := time.NewTicker(3 * time.Minute)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return bootstrap.Bootstrap{}
-
-		case <-ticker.C:
-			d.logger.Info(ctx, "Checking for changes to bootstrap")
-
-			newBootstrap, changed, err := d.checkBootstrap(
-				ctx, d.currBootstrap,
-			)
-			if err != nil {
-				d.logger.Error(ctx, "Checking bootstrap for changes failed", err)
-				continue
-			} else if !changed {
-				continue
-			}
-
-			err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap)
-			if err != nil {
-				d.logger.Error(ctx, "Writing new bootstrap to disk failed", err)
-				continue
-			}
-
-			return newBootstrap
-		}
-	}
-}
+func (d *daemon) reload(
+	ctx context.Context, newHosts map[nebula.HostName]bootstrap.Host,
+) error {
+	var (
+		newBootstrap = d.currBootstrap
+		thisHost     = d.currBootstrap.ThisHost()
+	)
+
+	newBootstrap.Hosts = newHosts
+
+	// the daemon's view of this host's bootstrap info takes precedence over
+	// whatever is in garage
+	newBootstrap.Hosts[thisHost.Name] = thisHost
+
+	diff, err := calcBootstrapDiff(d.daemonConfig, d.currBootstrap, newBootstrap)
+	if err != nil {
+		return fmt.Errorf("calculating diff between bootstraps: %w", err)
+	} else if diff == (bootstrapDiff{}) {
+		return nil
+	}
+
+	d.logger.Info(ctx, "Bootstrap has changed, storing new bootstrap")
+	d.l.Lock()
+	d.currBootstrap = newBootstrap
+	d.l.Unlock()
+
+	var errs []error
+
+	if diff.dnsChanged {
+		d.logger.Info(ctx, "Restarting dnsmasq to account for bootstrap changes")
+		if err := d.children.RestartDNSMasq(newBootstrap); err != nil {
+			errs = append(errs, fmt.Errorf("restarting dnsmasq: %w", err))
+		}
+	}
+
+	if diff.nebulaChanged {
+		d.logger.Info(ctx, "Restarting nebula to account for bootstrap changes")
+		if err := d.children.RestartNebula(newBootstrap); err != nil {
+			errs = append(errs, fmt.Errorf("restarting nebula: %w", err))
+		}
+	}
+
+	return errors.Join(errs...)
+}

 func (d *daemon) postInit(ctx context.Context) bool {
@@ -450,7 +411,10 @@
 	return true
 }

-func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
+func (d *daemon) restartLoop(
+	ctx context.Context,
+	readyCh chan<- struct{},
+) {
 	wait := func(d time.Duration) bool {
 		select {
 		case <-ctx.Done():
@@ -465,15 +429,20 @@
 		case readyCh <- struct{}{}:
 		default:
 		}
+		readyCh = nil
 	}

-	for {
-		if ctx.Err() != nil {
-			return
-		}
-
+	var (
+		children *Children
+		err      error
+	)
+
+	for {
 		d.logger.Info(ctx, "Creating child processes")
-		children, err := NewChildren(
+		// TODO this could probably get moved outside of restartLoop, and into
+		// initialize. If it fails the error can get passed up to the caller and
+		// no changes can be made.
+		children, err = NewChildren(
 			ctx,
 			d.logger.WithNamespace("children"),
 			d.envBinDirPath,
@@ -508,27 +477,33 @@
 		d.l.Unlock()

 		ready()
+		break
+	}

-		newBootstrap := d.watchForChanges(ctx)
-		if ctx.Err() != nil {
-			return
-		}
-
-		d.logger.Info(ctx, "Bootstrap has changed, will restart daemon")
-
-		d.l.Lock()
-		d.currBootstrap = newBootstrap
-		d.state = daemonStateRestarting
-		d.l.Unlock()
-
-		d.logger.Info(ctx, "Shutting down previous child processes")
-		if err := d.children.Shutdown(); err != nil {
-			d.logger.Fatal(ctx, "Failed to cleanly shutdown children, there may be orphaned child processes", err)
-		}
-
-		// in case context was canceled while shutting the Children down, we
-		// don't want the Shutdown method to re-attempt calling Shutdown on
-		// it.
-		d.children = nil
+	ticker := time.NewTicker(3 * time.Minute)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+
+		case <-ticker.C:
+			d.logger.Info(ctx, "Checking for bootstrap changes")
+
+			newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, d.currBootstrap)
+			if err != nil {
+				d.logger.Error(ctx, "Failed to get hosts from garage", err)
+				continue
+			}
+
+			if err := d.reload(ctx, newHosts); err != nil {
+				d.logger.Error(ctx, "Reloading with new host data failed", err)
+				continue
+			}
+
+			ready()
+		}
 	}
 }

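The restart loop now settles into a single ticker-driven poll: fetch hosts from garage, hand them to reload, and let reload decide which children (if any) to bounce. A self-contained sketch of that shape, with the garage fetch and reload stubbed out:

	package main

	import (
		"context"
		"fmt"
		"time"
	)

	// poll mirrors the loop restartLoop ends in: exit on cancellation,
	// otherwise attempt a reload on every tick and log failures rather
	// than tearing the whole process tree down.
	func poll(ctx context.Context, interval time.Duration, reload func(context.Context) error) {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				if err := reload(ctx); err != nil {
					fmt.Println("reload failed:", err)
				}
			}
		}
	}

	func main() {
		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
		defer cancel()
		poll(ctx, time.Second, func(context.Context) error { return nil })
	}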
View File

@@ -11,10 +11,6 @@ var (
 	// being initialized.
 	ErrInitializing = jsonrpc2.NewError(2, "Network is being initialized")

-	// ErrRestarting is returned when a network is unavailable due to being
-	// restarted.
-	ErrRestarting = jsonrpc2.NewError(3, "Network is being restarted")
-
 	// ErrAlreadyJoined is returned when the daemon is instructed to create or
 	// join a new network, but it is already joined to a network.
 	ErrAlreadyJoined = jsonrpc2.NewError(4, "Already joined to a network")