diff --git a/go/daemon/bootstrap.go b/go/daemon/bootstrap.go index 4fa5d4b..2871a38 100644 --- a/go/daemon/bootstrap.go +++ b/go/daemon/bootstrap.go @@ -1,10 +1,13 @@ package daemon import ( + "bytes" "encoding/json" + "errors" "fmt" "os" "path/filepath" + "reflect" "isle/bootstrap" "isle/garage/garagesrv" @@ -77,3 +80,49 @@ func coalesceDaemonConfigAndBootstrap( return hostBootstrap, nil } + +type bootstrapDiff struct { + hostsChanged bool + nebulaChanged bool + dnsChanged bool +} + +func calcBootstrapDiff( + daemonConfig Config, + prevBootstrap, nextBootstrap bootstrap.Bootstrap, +) ( + diff bootstrapDiff, err error, +) { + { + prevHash, prevErr := bootstrap.HostsHash(prevBootstrap.Hosts) + nextHash, nextErr := bootstrap.HostsHash(nextBootstrap.Hosts) + if err = errors.Join(prevErr, nextErr); err != nil { + err = fmt.Errorf("calculating host hashes: %w", err) + return + } + + diff.hostsChanged = !bytes.Equal(prevHash, nextHash) + } + + { + prevNebulaConfig, prevErr := nebulaConfig(daemonConfig, prevBootstrap) + nextNebulaConfig, nextErr := nebulaConfig(daemonConfig, nextBootstrap) + if err = errors.Join(prevErr, nextErr); err != nil { + err = fmt.Errorf("calculating nebula config: %w", err) + return + } + + diff.nebulaChanged = !reflect.DeepEqual( + prevNebulaConfig, nextNebulaConfig, + ) + } + + { + diff.dnsChanged = !reflect.DeepEqual( + dnsmasqConfig(daemonConfig, prevBootstrap), + dnsmasqConfig(daemonConfig, nextBootstrap), + ) + } + + return +} diff --git a/go/daemon/child_dnsmasq.go b/go/daemon/child_dnsmasq.go index 5188aca..5bbe8d7 100644 --- a/go/daemon/child_dnsmasq.go +++ b/go/daemon/child_dnsmasq.go @@ -12,16 +12,9 @@ import ( "dev.mediocregopher.com/mediocre-go-lib.git/mlog" ) -func dnsmasqPmuxProcConfig( - logger *mlog.Logger, - runtimeDirPath, binDirPath string, - hostBootstrap bootstrap.Bootstrap, - daemonConfig Config, -) ( - pmuxlib.ProcessConfig, error, -) { - confPath := filepath.Join(runtimeDirPath, "dnsmasq.conf") - +func dnsmasqConfig( + 
daemonConfig Config, hostBootstrap bootstrap.Bootstrap, +) dnsmasq.ConfData { hostsSlice := make([]dnsmasq.ConfDataHost, 0, len(hostBootstrap.Hosts)) for _, host := range hostBootstrap.Hosts { hostsSlice = append(hostsSlice, dnsmasq.ConfDataHost{ @@ -34,15 +27,48 @@ func dnsmasqPmuxProcConfig( return hostsSlice[i].IP < hostsSlice[j].IP }) - confData := dnsmasq.ConfData{ + return dnsmasq.ConfData{ Resolvers: daemonConfig.DNS.Resolvers, Domain: hostBootstrap.NetworkCreationParams.Domain, IP: hostBootstrap.ThisHost().IP().String(), Hosts: hostsSlice, } +} + +func dnsmasqWriteConfig( + runtimeDirPath string, + daemonConfig Config, + hostBootstrap bootstrap.Bootstrap, +) ( + string, error, +) { + var ( + confPath = filepath.Join(runtimeDirPath, "dnsmasq.conf") + confData = dnsmasqConfig(daemonConfig, hostBootstrap) + ) if err := dnsmasq.WriteConfFile(confPath, confData); err != nil { - return pmuxlib.ProcessConfig{}, fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err) + return "", fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err) + } + + return confPath, nil +} + +func dnsmasqPmuxProcConfig( + logger *mlog.Logger, + runtimeDirPath, binDirPath string, + daemonConfig Config, + hostBootstrap bootstrap.Bootstrap, +) ( + pmuxlib.ProcessConfig, error, +) { + confPath, err := dnsmasqWriteConfig( + runtimeDirPath, daemonConfig, hostBootstrap, + ) + if err != nil { + return pmuxlib.ProcessConfig{}, fmt.Errorf( + "writing dnsmasq config: %w", err, + ) } return pmuxlib.ProcessConfig{ diff --git a/go/daemon/child_nebula.go b/go/daemon/child_nebula.go index 0489186..cdab02d 100644 --- a/go/daemon/child_nebula.go +++ b/go/daemon/child_nebula.go @@ -46,14 +46,12 @@ func waitForNebula( return ctx.Err() } -func nebulaPmuxProcConfig( - runtimeDirPath, binDirPath string, +func nebulaConfig( daemonConfig Config, hostBootstrap bootstrap.Bootstrap, ) ( - pmuxlib.ProcessConfig, error, + map[string]any, error, ) { - var ( lighthouseHostIPs []string staticHostMap = 
map[string][]string{} @@ -72,23 +70,19 @@ func nebulaPmuxProcConfig( caCertPEM, err := hostBootstrap.CAPublicCredentials.Cert.Unwrap().MarshalToPEM() if err != nil { - return pmuxlib.ProcessConfig{}, fmt.Errorf( - "marshaling CA cert to PEM: :%w", err, - ) + return nil, fmt.Errorf("marshaling CA cert to PEM: %w", err) } hostCertPEM, err := hostBootstrap.PublicCredentials.Cert.Unwrap().MarshalToPEM() if err != nil { - return pmuxlib.ProcessConfig{}, fmt.Errorf( - "marshaling host cert to PEM: :%w", err, - ) + return nil, fmt.Errorf("marshaling host cert to PEM: %w", err) } hostKeyPEM := cert.MarshalX25519PrivateKey( hostBootstrap.PrivateCredentials.EncryptingPrivateKey.Bytes(), ) - config := map[string]interface{}{ + config := map[string]any{ "pki": map[string]string{ "ca": string(caCertPEM), "cert": string(hostCertPEM), @@ -99,7 +93,7 @@ func nebulaPmuxProcConfig( "punch": true, "respond": true, }, - "tun": map[string]interface{}{ + "tun": map[string]any{ "dev": daemonConfig.VPN.Tun.Device, }, "firewall": daemonConfig.VPN.Firewall, @@ -112,7 +106,7 @@ func nebulaPmuxProcConfig( "port": "0", } - config["lighthouse"] = map[string]interface{}{ + config["lighthouse"] = map[string]any{ "hosts": lighthouseHostIPs, } @@ -121,7 +115,9 @@ func nebulaPmuxProcConfig( _, port, err := net.SplitHostPort(publicAddr) if err != nil { - return pmuxlib.ProcessConfig{}, fmt.Errorf("parsing public address %q: %w", publicAddr, err) + return nil, fmt.Errorf( + "parsing public address %q: %w", publicAddr, err, + ) } config["listen"] = map[string]string{ @@ -129,16 +125,55 @@ func nebulaPmuxProcConfig( "port": port, } - config["lighthouse"] = map[string]interface{}{ + config["lighthouse"] = map[string]any{ "hosts": []string{}, "am_lighthouse": true, } } + return config, nil +} + +func nebulaWriteConfig( + runtimeDirPath string, + daemonConfig Config, + hostBootstrap bootstrap.Bootstrap, +) ( + string, error, +) { + config, err := nebulaConfig(daemonConfig, hostBootstrap) + if err != nil 
{ + return "", fmt.Errorf("creating nebula config: %w", err) + } + nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml") if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil { - return pmuxlib.ProcessConfig{}, fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err) + return "", fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err) + } + + return nebulaYmlPath, nil +} + +func nebulaPmuxProcConfig( + runtimeDirPath, binDirPath string, + daemonConfig Config, + hostBootstrap bootstrap.Bootstrap, +) ( + pmuxlib.ProcessConfig, error, +) { + config, err := nebulaConfig(daemonConfig, hostBootstrap) + if err != nil { + return pmuxlib.ProcessConfig{}, fmt.Errorf( + "creating nebula config: %w", err, + ) + } + + nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml") + if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil { + return pmuxlib.ProcessConfig{}, fmt.Errorf( + "writing nebula.yml to %q: %w", nebulaYmlPath, err, + ) } return pmuxlib.ProcessConfig{ diff --git a/go/daemon/child_pmux.go b/go/daemon/child_pmux.go index ff7c7ef..9be0946 100644 --- a/go/daemon/child_pmux.go +++ b/go/daemon/child_pmux.go @@ -31,8 +31,8 @@ func (c *Children) newPmuxConfig( c.logger, c.opts.EnvVars.RuntimeDirPath, binDirPath, - hostBootstrap, daemonConfig, + hostBootstrap, ) if err != nil { return pmuxlib.Config{}, fmt.Errorf( diff --git a/go/daemon/children.go b/go/daemon/children.go index 9aa3d4f..9bc6aab 100644 --- a/go/daemon/children.go +++ b/go/daemon/children.go @@ -17,8 +17,9 @@ import ( // - dnsmasq // - garage (0 or more, depending on configured storage allocations) type Children struct { - logger *mlog.Logger - opts Opts + logger *mlog.Logger + daemonConfig Config + opts Opts pmux *pmuxlib.Pmux } @@ -46,8 +47,9 @@ func NewChildren( } c := &Children{ - logger: logger, - opts: *opts, + logger: logger, + daemonConfig: daemonConfig, + opts: *opts, } pmuxConfig, err := c.newPmuxConfig( @@ -84,6 +86,32 @@ func 
NewChildren( return c, nil } +// RestartDNSMasq rewrites the dnsmasq config and restarts the process. +func (c *Children) RestartDNSMasq(hostBootstrap bootstrap.Bootstrap) error { + _, err := dnsmasqWriteConfig( + c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap, + ) + if err != nil { + return fmt.Errorf("writing new dnsmasq config: %w", err) + } + + c.pmux.Restart("dnsmasq") + return nil +} + +// RestartNebula rewrites the nebula config and restarts the process. +func (c *Children) RestartNebula(hostBootstrap bootstrap.Bootstrap) error { + _, err := nebulaWriteConfig( + c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap, + ) + if err != nil { + return fmt.Errorf("writing a new nebula config: %w", err) + } + + c.pmux.Restart("nebula") + return nil +} + // Shutdown blocks until all child processes have gracefully shut themselves // down. // diff --git a/go/daemon/daemon.go b/go/daemon/daemon.go index 376d75b..c458843 100644 --- a/go/daemon/daemon.go +++ b/go/daemon/daemon.go @@ -3,7 +3,6 @@ package daemon import ( - "bytes" "context" "errors" "fmt" @@ -146,7 +145,6 @@ const ( daemonStateNoNetwork = iota daemonStateInitializing daemonStateOk - daemonStateRestarting daemonStateShutdown ) @@ -182,10 +180,6 @@ type daemon struct { // While still starting up the Daemon for the first time all methods will return // ErrInitializing, except Shutdown which will block until initialization is // canceled. -// -// TODO make daemon smarter, it currently restarts on _any_ change, but -// it should restart itself only when there's something actually requiring a -// restart. 
func NewDaemon( logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts, ) ( @@ -294,8 +288,6 @@ func withCurrBootstrap[Res any]( return zero, ErrInitializing case daemonStateOk: return fn(currBootstrap) - case daemonStateRestarting: - return zero, ErrRestarting case daemonStateShutdown: return zero, errors.New("already shutdown") default: @@ -303,80 +295,49 @@ func withCurrBootstrap[Res any]( } } -// creates a new bootstrap file using available information from the network. If -// the new bootstrap file is different than the existing one, the existing one -// is overwritten and true is returned. -func (d *daemon) checkBootstrap( - ctx context.Context, hostBootstrap bootstrap.Bootstrap, -) ( - bootstrap.Bootstrap, bool, error, -) { +func (d *daemon) reload( + ctx context.Context, newHosts map[nebula.HostName]bootstrap.Host, +) error { + var ( + newBootstrap = d.currBootstrap + thisHost = d.currBootstrap.ThisHost() + ) - thisHost := hostBootstrap.ThisHost() - - newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, hostBootstrap) - if err != nil { - return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err) - } + newBootstrap.Hosts = newHosts // the daemon's view of this host's bootstrap info takes precedence over // whatever is in garage - newHosts[thisHost.Name] = thisHost + newBootstrap.Hosts[thisHost.Name] = thisHost - newHostsHash, err := bootstrap.HostsHash(newHosts) + diff, err := calcBootstrapDiff(d.daemonConfig, d.currBootstrap, newBootstrap) if err != nil { - return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err) + return fmt.Errorf("calculating diff between bootstraps: %w", err) + } else if diff == (bootstrapDiff{}) { + return nil } - currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts) - if err != nil { - return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err) - } + d.logger.Info(ctx, "Bootstrap has changed, storing new 
bootstrap") + d.l.Lock() + d.currBootstrap = newBootstrap + d.l.Unlock() - if bytes.Equal(newHostsHash, currHostsHash) { - return hostBootstrap, false, nil - } + var errs []error - hostBootstrap.Hosts = newHosts - - return hostBootstrap, true, nil -} - -// blocks until a new bootstrap is available or context is canceled -func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap { - - ticker := time.NewTicker(3 * time.Minute) - defer ticker.Stop() - - for { - select { - - case <-ctx.Done(): - return bootstrap.Bootstrap{} - - case <-ticker.C: - - d.logger.Info(ctx, "Checking for changes to bootstrap") - - newBootstrap, changed, err := d.checkBootstrap( - ctx, d.currBootstrap, - ) - if err != nil { - d.logger.Error(ctx, "Checking bootstrap for changes failed", err) - continue - } else if !changed { - continue - } - - err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap) - if err != nil { - d.logger.Error(ctx, "Writing new bootstrap to disk failed", err) - continue - } - - return newBootstrap + if diff.dnsChanged { + d.logger.Info(ctx, "Restarting dnsmasq to account for bootstrap changes") + if err := d.children.RestartDNSMasq(newBootstrap); err != nil { + errs = append(errs, fmt.Errorf("restarting dnsmasq: %w", err)) } } + + if diff.nebulaChanged { + d.logger.Info(ctx, "Restarting nebula to account for bootstrap changes") + if err := d.children.RestartNebula(newBootstrap); err != nil { + errs = append(errs, fmt.Errorf("restarting nebula: %w", err)) + } + } + + return errors.Join(errs...) 
} func (d *daemon) postInit(ctx context.Context) bool { @@ -450,7 +411,10 @@ func (d *daemon) postInit(ctx context.Context) bool { return true } -func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) { +func (d *daemon) restartLoop( + ctx context.Context, + readyCh chan<- struct{}, +) { wait := func(d time.Duration) bool { select { case <-ctx.Done(): @@ -465,15 +429,20 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) { case readyCh <- struct{}{}: default: } + readyCh = nil } - for { - if ctx.Err() != nil { - return - } + var ( + children *Children + err error + ) + for { d.logger.Info(ctx, "Creating child processes") - children, err := NewChildren( + // TODO this could probably get moved outside of restartLoop, and into + // initialize. If it fails the error can get passed up to the caller and + // no changes can be made. + children, err = NewChildren( ctx, d.logger.WithNamespace("children"), d.envBinDirPath, @@ -508,27 +477,33 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) { d.l.Unlock() ready() + break + } - newBootstrap := d.watchForChanges(ctx) - if ctx.Err() != nil { + ticker := time.NewTicker(3 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): return + + case <-ticker.C: + + d.logger.Info(ctx, "Checking for bootstrap changes") + newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, d.currBootstrap) + if err != nil { + d.logger.Error(ctx, "Failed to get hosts from garage", err) + continue + } + + if err := d.reload(ctx, newHosts); err != nil { + d.logger.Error(ctx, "Reloading with new host data failed", err) + continue + } + + ready() } - - d.logger.Info(ctx, "Bootstrap has changed, will restart daemon") - d.l.Lock() - d.currBootstrap = newBootstrap - d.state = daemonStateRestarting - d.l.Unlock() - - d.logger.Info(ctx, "Shutting down previous child processes") - if err := d.children.Shutdown(); err != nil { - d.logger.Fatal(ctx, "Failed to cleanly 
shutdown children, there may be orphaned child processes", err) - } - - // in case context was canceled while shutting the Children down, we - // don't want the Shutdown method to re-attempt calling Shutdown on - // it. - d.children = nil } } diff --git a/go/daemon/errors.go b/go/daemon/errors.go index d3ac064..6e27e5e 100644 --- a/go/daemon/errors.go +++ b/go/daemon/errors.go @@ -11,10 +11,6 @@ var ( // being initialized. ErrInitializing = jsonrpc2.NewError(2, "Network is being initialized") - // ErrRestarting is returned when a network is unavailable due to being - // restarted. - ErrRestarting = jsonrpc2.NewError(3, "Network is being restarted") - // ErrAlreadyJoined is returned when the daemon is instructed to create or // join a new network, but it is already joined to a network. ErrAlreadyJoined = jsonrpc2.NewError(4, "Already joined to a network")