Only restart sub-processes which need restarting on bootstrap changes

This commit is contained in:
Brian Picciano 2024-07-19 20:49:04 +02:00
parent bc9a2b62ef
commit 7aa11ebe29
7 changed files with 240 additions and 131 deletions

View File

@ -1,10 +1,13 @@
package daemon
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"reflect"
"isle/bootstrap"
"isle/garage/garagesrv"
@ -77,3 +80,49 @@ func coalesceDaemonConfigAndBootstrap(
return hostBootstrap, nil
}
// bootstrapDiff summarizes which aspects of the daemon's derived state are
// affected when moving from one bootstrap to another. Each flag is intended to
// be true when the corresponding piece differs between the two bootstraps:
//
//   - hostsChanged: the hash of the bootstrap's hosts.
//   - nebulaChanged: the generated nebula config.
//   - dnsChanged: the generated dnsmasq config.
//
// The zero value means "nothing changed", and the type is comparable so it can
// be checked against bootstrapDiff{} directly.
type bootstrapDiff struct {
	hostsChanged, nebulaChanged, dnsChanged bool
}
// calcBootstrapDiff reports which pieces of derived state differ between
// prevBootstrap and nextBootstrap, given the same daemonConfig for both.
//
// Three independent comparisons are made:
//   - the hash of each bootstrap's Hosts (hostsChanged),
//   - the nebula config generated from each bootstrap (nebulaChanged),
//   - the dnsmasq config generated from each bootstrap (dnsChanged).
//
// Every flag is true only when the two sides are NOT equal, so a zero-value
// bootstrapDiff means nothing changed. If hashing the hosts or generating the
// nebula config fails for either bootstrap, a zero-value diff is returned
// along with the (joined) error.
func calcBootstrapDiff(
	daemonConfig Config,
	prevBootstrap, nextBootstrap bootstrap.Bootstrap,
) (
	diff bootstrapDiff, err error,
) {
	prevHash, prevErr := bootstrap.HostsHash(prevBootstrap.Hosts)
	nextHash, nextErr := bootstrap.HostsHash(nextBootstrap.Hosts)
	if err := errors.Join(prevErr, nextErr); err != nil {
		return bootstrapDiff{}, fmt.Errorf("calculating host hashes: %w", err)
	}

	prevNebulaConfig, prevErr := nebulaConfig(daemonConfig, prevBootstrap)
	nextNebulaConfig, nextErr := nebulaConfig(daemonConfig, nextBootstrap)
	if err := errors.Join(prevErr, nextErr); err != nil {
		return bootstrapDiff{}, fmt.Errorf("calculating nebula config: %w", err)
	}

	// A "changed" flag must be the negation of the equality check; setting it
	// to the equality result itself would flag changes precisely when there
	// are none (and miss real ones).
	return bootstrapDiff{
		hostsChanged: !bytes.Equal(prevHash, nextHash),
		nebulaChanged: !reflect.DeepEqual(
			prevNebulaConfig, nextNebulaConfig,
		),
		dnsChanged: !reflect.DeepEqual(
			dnsmasqConfig(daemonConfig, prevBootstrap),
			dnsmasqConfig(daemonConfig, nextBootstrap),
		),
	}, nil
}

View File

@ -12,16 +12,9 @@ import (
"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
)
func dnsmasqPmuxProcConfig(
logger *mlog.Logger,
runtimeDirPath, binDirPath string,
hostBootstrap bootstrap.Bootstrap,
daemonConfig Config,
) (
pmuxlib.ProcessConfig, error,
) {
confPath := filepath.Join(runtimeDirPath, "dnsmasq.conf")
func dnsmasqConfig(
daemonConfig Config, hostBootstrap bootstrap.Bootstrap,
) dnsmasq.ConfData {
hostsSlice := make([]dnsmasq.ConfDataHost, 0, len(hostBootstrap.Hosts))
for _, host := range hostBootstrap.Hosts {
hostsSlice = append(hostsSlice, dnsmasq.ConfDataHost{
@ -34,15 +27,48 @@ func dnsmasqPmuxProcConfig(
return hostsSlice[i].IP < hostsSlice[j].IP
})
confData := dnsmasq.ConfData{
return dnsmasq.ConfData{
Resolvers: daemonConfig.DNS.Resolvers,
Domain: hostBootstrap.NetworkCreationParams.Domain,
IP: hostBootstrap.ThisHost().IP().String(),
Hosts: hostsSlice,
}
}
// dnsmasqWriteConfig generates the dnsmasq configuration for the given daemon
// config and bootstrap (via dnsmasqConfig) and writes it to a file named
// "dnsmasq.conf" inside runtimeDirPath.
//
// It returns the path of the config file on success. If the file cannot be
// written the returned error wraps the underlying one and includes the path.
func dnsmasqWriteConfig(
runtimeDirPath string,
daemonConfig Config,
hostBootstrap bootstrap.Bootstrap,
) (
string, error,
) {
var (
confPath = filepath.Join(runtimeDirPath, "dnsmasq.conf")
confData = dnsmasqConfig(daemonConfig, hostBootstrap)
)
if err := dnsmasq.WriteConfFile(confPath, confData); err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
return "", fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
}
return confPath, nil
}
func dnsmasqPmuxProcConfig(
logger *mlog.Logger,
runtimeDirPath, binDirPath string,
daemonConfig Config,
hostBootstrap bootstrap.Bootstrap,
) (
pmuxlib.ProcessConfig, error,
) {
confPath, err := dnsmasqWriteConfig(
runtimeDirPath, daemonConfig, hostBootstrap,
)
if err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf(
"writing dnsmasq config: %w", err,
)
}
return pmuxlib.ProcessConfig{

View File

@ -46,14 +46,12 @@ func waitForNebula(
return ctx.Err()
}
func nebulaPmuxProcConfig(
runtimeDirPath, binDirPath string,
func nebulaConfig(
daemonConfig Config,
hostBootstrap bootstrap.Bootstrap,
) (
pmuxlib.ProcessConfig, error,
map[string]any, error,
) {
var (
lighthouseHostIPs []string
staticHostMap = map[string][]string{}
@ -72,23 +70,19 @@ func nebulaPmuxProcConfig(
caCertPEM, err := hostBootstrap.CAPublicCredentials.Cert.Unwrap().MarshalToPEM()
if err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf(
"marshaling CA cert to PEM: :%w", err,
)
return nil, fmt.Errorf("marshaling CA cert to PEM: :%w", err)
}
hostCertPEM, err := hostBootstrap.PublicCredentials.Cert.Unwrap().MarshalToPEM()
if err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf(
"marshaling host cert to PEM: :%w", err,
)
return nil, fmt.Errorf("marshaling host cert to PEM: :%w", err)
}
hostKeyPEM := cert.MarshalX25519PrivateKey(
hostBootstrap.PrivateCredentials.EncryptingPrivateKey.Bytes(),
)
config := map[string]interface{}{
config := map[string]any{
"pki": map[string]string{
"ca": string(caCertPEM),
"cert": string(hostCertPEM),
@ -99,7 +93,7 @@ func nebulaPmuxProcConfig(
"punch": true,
"respond": true,
},
"tun": map[string]interface{}{
"tun": map[string]any{
"dev": daemonConfig.VPN.Tun.Device,
},
"firewall": daemonConfig.VPN.Firewall,
@ -112,7 +106,7 @@ func nebulaPmuxProcConfig(
"port": "0",
}
config["lighthouse"] = map[string]interface{}{
config["lighthouse"] = map[string]any{
"hosts": lighthouseHostIPs,
}
@ -121,7 +115,9 @@ func nebulaPmuxProcConfig(
_, port, err := net.SplitHostPort(publicAddr)
if err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf("parsing public address %q: %w", publicAddr, err)
return nil, fmt.Errorf(
"parsing public address %q: %w", publicAddr, err,
)
}
config["listen"] = map[string]string{
@ -129,16 +125,55 @@ func nebulaPmuxProcConfig(
"port": port,
}
config["lighthouse"] = map[string]interface{}{
config["lighthouse"] = map[string]any{
"hosts": []string{},
"am_lighthouse": true,
}
}
return config, nil
}
// nebulaWriteConfig generates the nebula configuration for the given daemon
// config and bootstrap (via nebulaConfig) and writes it as YAML to a file named
// "nebula.yml" inside runtimeDirPath.
//
// It returns the path of the written file on success. Errors from generating
// the config or from writing the file are wrapped and returned.
func nebulaWriteConfig(
runtimeDirPath string,
daemonConfig Config,
hostBootstrap bootstrap.Bootstrap,
) (
string, error,
) {
config, err := nebulaConfig(daemonConfig, hostBootstrap)
if err != nil {
return "", fmt.Errorf("creating nebula config: %w", err)
}
nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
// 0600: the file is written readable/writable by its owner only.
if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
return "", fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
}
return nebulaYmlPath, nil
}
func nebulaPmuxProcConfig(
runtimeDirPath, binDirPath string,
daemonConfig Config,
hostBootstrap bootstrap.Bootstrap,
) (
pmuxlib.ProcessConfig, error,
) {
config, err := nebulaConfig(daemonConfig, hostBootstrap)
if err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf(
"creating nebula config: %w", err,
)
}
nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
return pmuxlib.ProcessConfig{}, fmt.Errorf(
"writing nebula.yml to %q: %w", nebulaYmlPath, err,
)
}
return pmuxlib.ProcessConfig{

View File

@ -31,8 +31,8 @@ func (c *Children) newPmuxConfig(
c.logger,
c.opts.EnvVars.RuntimeDirPath,
binDirPath,
hostBootstrap,
daemonConfig,
hostBootstrap,
)
if err != nil {
return pmuxlib.Config{}, fmt.Errorf(

View File

@ -17,8 +17,9 @@ import (
// - dnsmasq
// - garage (0 or more, depending on configured storage allocations)
type Children struct {
logger *mlog.Logger
opts Opts
logger *mlog.Logger
daemonConfig Config
opts Opts
pmux *pmuxlib.Pmux
}
@ -46,8 +47,9 @@ func NewChildren(
}
c := &Children{
logger: logger,
opts: *opts,
logger: logger,
daemonConfig: daemonConfig,
opts: *opts,
}
pmuxConfig, err := c.newPmuxConfig(
@ -84,6 +86,32 @@ func NewChildren(
return c, nil
}
// RestartDNSMasq regenerates the dnsmasq config file from hostBootstrap (using
// the daemon config the Children was created with) and then asks pmux to
// restart the "dnsmasq" process.
//
// If the config cannot be written the process is not restarted and the wrapped
// error is returned.
func (c *Children) RestartDNSMasq(hostBootstrap bootstrap.Bootstrap) error {
	runtimeDirPath := c.opts.EnvVars.RuntimeDirPath

	// Only the outcome of the write matters here; the returned path is unused.
	if _, err := dnsmasqWriteConfig(
		runtimeDirPath, c.daemonConfig, hostBootstrap,
	); err != nil {
		return fmt.Errorf("writing new dnsmasq config: %w", err)
	}

	c.pmux.Restart("dnsmasq")
	return nil
}
// RestartNebula regenerates the nebula config file from hostBootstrap (using
// the daemon config the Children was created with) and then asks pmux to
// restart the "nebula" process.
//
// If the config cannot be written the process is not restarted and the wrapped
// error is returned.
func (c *Children) RestartNebula(hostBootstrap bootstrap.Bootstrap) error {
	runtimeDirPath := c.opts.EnvVars.RuntimeDirPath

	// Only the outcome of the write matters here; the returned path is unused.
	if _, err := nebulaWriteConfig(
		runtimeDirPath, c.daemonConfig, hostBootstrap,
	); err != nil {
		return fmt.Errorf("writing a new nebula config: %w", err)
	}

	c.pmux.Restart("nebula")
	return nil
}
// Shutdown blocks until all child processes have gracefully shut themselves
// down.
//

View File

@ -3,7 +3,6 @@
package daemon
import (
"bytes"
"context"
"errors"
"fmt"
@ -146,7 +145,6 @@ const (
daemonStateNoNetwork = iota
daemonStateInitializing
daemonStateOk
daemonStateRestarting
daemonStateShutdown
)
@ -182,10 +180,6 @@ type daemon struct {
// While still starting up the Daemon for the first time all methods will return
// ErrInitializing, except Shutdown which will block until initialization is
// canceled.
//
// TODO make daemon smarter, it currently restarts on _any_ change, but
// it should restart itself only when there's something actually requiring a
// restart.
func NewDaemon(
logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts,
) (
@ -294,8 +288,6 @@ func withCurrBootstrap[Res any](
return zero, ErrInitializing
case daemonStateOk:
return fn(currBootstrap)
case daemonStateRestarting:
return zero, ErrRestarting
case daemonStateShutdown:
return zero, errors.New("already shutdown")
default:
@ -303,80 +295,49 @@ func withCurrBootstrap[Res any](
}
}
// creates a new bootstrap file using available information from the network. If
// the new bootstrap file is different than the existing one, the existing one
// is overwritten and true is returned.
func (d *daemon) checkBootstrap(
ctx context.Context, hostBootstrap bootstrap.Bootstrap,
) (
bootstrap.Bootstrap, bool, error,
) {
func (d *daemon) reload(
ctx context.Context, newHosts map[nebula.HostName]bootstrap.Host,
) error {
var (
newBootstrap = d.currBootstrap
thisHost = d.currBootstrap.ThisHost()
)
thisHost := hostBootstrap.ThisHost()
newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, hostBootstrap)
if err != nil {
return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err)
}
newBootstrap.Hosts = newHosts
// the daemon's view of this host's bootstrap info takes precedence over
// whatever is in garage
newHosts[thisHost.Name] = thisHost
newBootstrap.Hosts[thisHost.Name] = thisHost
newHostsHash, err := bootstrap.HostsHash(newHosts)
diff, err := calcBootstrapDiff(d.daemonConfig, d.currBootstrap, newBootstrap)
if err != nil {
return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err)
return fmt.Errorf("calculating diff between bootstraps: %w", err)
} else if diff == (bootstrapDiff{}) {
return nil
}
currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts)
if err != nil {
return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err)
}
d.logger.Info(ctx, "Bootstrap has changed, storing new bootstrap")
d.l.Lock()
d.currBootstrap = newBootstrap
d.l.Unlock()
if bytes.Equal(newHostsHash, currHostsHash) {
return hostBootstrap, false, nil
}
var errs []error
hostBootstrap.Hosts = newHosts
return hostBootstrap, true, nil
}
// blocks until a new bootstrap is available or context is canceled
func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap {
ticker := time.NewTicker(3 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return bootstrap.Bootstrap{}
case <-ticker.C:
d.logger.Info(ctx, "Checking for changes to bootstrap")
newBootstrap, changed, err := d.checkBootstrap(
ctx, d.currBootstrap,
)
if err != nil {
d.logger.Error(ctx, "Checking bootstrap for changes failed", err)
continue
} else if !changed {
continue
}
err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap)
if err != nil {
d.logger.Error(ctx, "Writing new bootstrap to disk failed", err)
continue
}
return newBootstrap
if diff.dnsChanged {
d.logger.Info(ctx, "Restarting dnsmasq to account for bootstrap changes")
if err := d.children.RestartDNSMasq(newBootstrap); err != nil {
errs = append(errs, fmt.Errorf("restarting dnsmasq: %w", err))
}
}
if diff.nebulaChanged {
d.logger.Info(ctx, "Restarting nebula to account for bootstrap changes")
if err := d.children.RestartNebula(newBootstrap); err != nil {
errs = append(errs, fmt.Errorf("restarting nebula: %w", err))
}
}
return errors.Join(errs...)
}
func (d *daemon) postInit(ctx context.Context) bool {
@ -450,7 +411,10 @@ func (d *daemon) postInit(ctx context.Context) bool {
return true
}
func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
func (d *daemon) restartLoop(
ctx context.Context,
readyCh chan<- struct{},
) {
wait := func(d time.Duration) bool {
select {
case <-ctx.Done():
@ -465,15 +429,20 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
case readyCh <- struct{}{}:
default:
}
readyCh = nil
}
for {
if ctx.Err() != nil {
return
}
var (
children *Children
err error
)
for {
d.logger.Info(ctx, "Creating child processes")
children, err := NewChildren(
// TODO this could probably get moved outside of restartLoop, and into
// initialize. If it fails the error can get passed up to the caller and
// no changes can be made.
children, err = NewChildren(
ctx,
d.logger.WithNamespace("children"),
d.envBinDirPath,
@ -508,27 +477,33 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
d.l.Unlock()
ready()
break
}
newBootstrap := d.watchForChanges(ctx)
if ctx.Err() != nil {
ticker := time.NewTicker(3 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
d.logger.Info(ctx, "Checking for bootstrap changes")
newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, d.currBootstrap)
if err != nil {
d.logger.Error(ctx, "Failed to get hosts from garage", err)
continue
}
if err := d.reload(ctx, newHosts); err != nil {
d.logger.Error(ctx, "Reloading with new host data failed", err)
continue
}
ready()
}
d.logger.Info(ctx, "Bootstrap has changed, will restart daemon")
d.l.Lock()
d.currBootstrap = newBootstrap
d.state = daemonStateRestarting
d.l.Unlock()
d.logger.Info(ctx, "Shutting down previous child processes")
if err := d.children.Shutdown(); err != nil {
d.logger.Fatal(ctx, "Failed to cleanly shutdown children, there may be orphaned child processes", err)
}
// in case context was canceled while shutting the Children down, we
// don't want the Shutdown method to re-attempt calling Shutdown on
// it.
d.children = nil
}
}

View File

@ -11,10 +11,6 @@ var (
// being initialized.
ErrInitializing = jsonrpc2.NewError(2, "Network is being initialized")
// ErrRestarting is returned when a network is unavailable due to being
// restarted.
ErrRestarting = jsonrpc2.NewError(3, "Network is being restarted")
// ErrAlreadyJoined is returned when the daemon is instructed to create or
// join a new network, but it is already joined to a network.
ErrAlreadyJoined = jsonrpc2.NewError(4, "Already joined to a network")