Only restart sub-processes which need restarting on bootstrap changes
parent bc9a2b62ef
commit 7aa11ebe29
@@ -1,10 +1,13 @@
 package daemon
 
 import (
+    "bytes"
     "encoding/json"
+    "errors"
     "fmt"
     "os"
     "path/filepath"
+    "reflect"
 
     "isle/bootstrap"
     "isle/garage/garagesrv"
@@ -77,3 +80,49 @@ func coalesceDaemonConfigAndBootstrap(
 
     return hostBootstrap, nil
 }
+
+type bootstrapDiff struct {
+    hostsChanged  bool
+    nebulaChanged bool
+    dnsChanged    bool
+}
+
+func calcBootstrapDiff(
+    daemonConfig Config,
+    prevBootstrap, nextBootstrap bootstrap.Bootstrap,
+) (
+    diff bootstrapDiff, err error,
+) {
+    {
+        prevHash, prevErr := bootstrap.HostsHash(prevBootstrap.Hosts)
+        nextHash, nextErr := bootstrap.HostsHash(nextBootstrap.Hosts)
+        if err = errors.Join(prevErr, nextErr); err != nil {
+            err = fmt.Errorf("calculating host hashes: %w", err)
+            return
+        }
+
+        diff.hostsChanged = !bytes.Equal(prevHash, nextHash)
+    }
+
+    {
+        prevNebulaConfig, prevErr := nebulaConfig(daemonConfig, prevBootstrap)
+        nextNebulaConfig, nextErr := nebulaConfig(daemonConfig, nextBootstrap)
+        if err = errors.Join(prevErr, nextErr); err != nil {
+            err = fmt.Errorf("calculating nebula config: %w", err)
+            return
+        }
+
+        diff.nebulaChanged = !reflect.DeepEqual(
+            prevNebulaConfig, nextNebulaConfig,
+        )
+    }
+
+    {
+        diff.dnsChanged = !reflect.DeepEqual(
+            dnsmasqConfig(daemonConfig, prevBootstrap),
+            dnsmasqConfig(daemonConfig, nextBootstrap),
+        )
+    }
+
+    return
+}
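Taken together, bootstrapDiff and calcBootstrapDiff answer the question "which child processes actually need a restart after a bootstrap change?". A rough sketch of how the result is meant to be consumed (the real caller is the daemon's reload method later in this commit; the function name below is purely illustrative):

    // Sketch only, not part of the commit: restart just the children whose
    // rendered config would differ under the new bootstrap.
    func applyBootstrapDiff(d *daemon, next bootstrap.Bootstrap) error {
        diff, err := calcBootstrapDiff(d.daemonConfig, d.currBootstrap, next)
        if err != nil {
            return fmt.Errorf("calculating diff between bootstraps: %w", err)
        } else if diff == (bootstrapDiff{}) {
            return nil // nothing relevant changed, leave all children running
        }

        var errs []error
        if diff.dnsChanged {
            errs = append(errs, d.children.RestartDNSMasq(next))
        }
        if diff.nebulaChanged {
            errs = append(errs, d.children.RestartNebula(next))
        }
        return errors.Join(errs...)
    }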
@@ -12,16 +12,9 @@ import (
     "dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 )
 
-func dnsmasqPmuxProcConfig(
-    logger *mlog.Logger,
-    runtimeDirPath, binDirPath string,
-    hostBootstrap bootstrap.Bootstrap,
-    daemonConfig Config,
-) (
-    pmuxlib.ProcessConfig, error,
-) {
-    confPath := filepath.Join(runtimeDirPath, "dnsmasq.conf")
-
+func dnsmasqConfig(
+    daemonConfig Config, hostBootstrap bootstrap.Bootstrap,
+) dnsmasq.ConfData {
     hostsSlice := make([]dnsmasq.ConfDataHost, 0, len(hostBootstrap.Hosts))
     for _, host := range hostBootstrap.Hosts {
         hostsSlice = append(hostsSlice, dnsmasq.ConfDataHost{
@@ -34,15 +27,48 @@ func dnsmasqPmuxProcConfig(
         return hostsSlice[i].IP < hostsSlice[j].IP
     })
 
-    confData := dnsmasq.ConfData{
+    return dnsmasq.ConfData{
         Resolvers: daemonConfig.DNS.Resolvers,
         Domain:    hostBootstrap.NetworkCreationParams.Domain,
         IP:        hostBootstrap.ThisHost().IP().String(),
         Hosts:     hostsSlice,
     }
+}
+
+func dnsmasqWriteConfig(
+    runtimeDirPath string,
+    daemonConfig Config,
+    hostBootstrap bootstrap.Bootstrap,
+) (
+    string, error,
+) {
+    var (
+        confPath = filepath.Join(runtimeDirPath, "dnsmasq.conf")
+        confData = dnsmasqConfig(daemonConfig, hostBootstrap)
+    )
 
     if err := dnsmasq.WriteConfFile(confPath, confData); err != nil {
-        return pmuxlib.ProcessConfig{}, fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
+        return "", fmt.Errorf("writing dnsmasq.conf to %q: %w", confPath, err)
     }
 
+    return confPath, nil
+}
+
+func dnsmasqPmuxProcConfig(
+    logger *mlog.Logger,
+    runtimeDirPath, binDirPath string,
+    daemonConfig Config,
+    hostBootstrap bootstrap.Bootstrap,
+) (
+    pmuxlib.ProcessConfig, error,
+) {
+    confPath, err := dnsmasqWriteConfig(
+        runtimeDirPath, daemonConfig, hostBootstrap,
+    )
+    if err != nil {
+        return pmuxlib.ProcessConfig{}, fmt.Errorf(
+            "writing dnsmasq config: %w", err,
+        )
+    }
+
     return pmuxlib.ProcessConfig{
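The point of splitting dnsmasqConfig out of the process setup is that the rendered config is now plain data that can be compared without touching the filesystem. A minimal sketch of that use (the helper name is illustrative, calcBootstrapDiff above does the same comparison inline, and reflect would need to be imported in this file):

    // Sketch only, not part of the commit.
    func dnsmasqNeedsRestart(daemonConfig Config, prev, next bootstrap.Bootstrap) bool {
        return !reflect.DeepEqual(
            dnsmasqConfig(daemonConfig, prev),
            dnsmasqConfig(daemonConfig, next),
        )
    }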
@@ -46,14 +46,12 @@ func waitForNebula(
     return ctx.Err()
 }
 
-func nebulaPmuxProcConfig(
-    runtimeDirPath, binDirPath string,
+func nebulaConfig(
     daemonConfig Config,
     hostBootstrap bootstrap.Bootstrap,
 ) (
-    pmuxlib.ProcessConfig, error,
+    map[string]any, error,
 ) {
-
     var (
         lighthouseHostIPs []string
         staticHostMap     = map[string][]string{}
@@ -72,23 +70,19 @@ func nebulaPmuxProcConfig(
 
     caCertPEM, err := hostBootstrap.CAPublicCredentials.Cert.Unwrap().MarshalToPEM()
     if err != nil {
-        return pmuxlib.ProcessConfig{}, fmt.Errorf(
-            "marshaling CA cert to PEM: :%w", err,
-        )
+        return nil, fmt.Errorf("marshaling CA cert to PEM: :%w", err)
     }
 
     hostCertPEM, err := hostBootstrap.PublicCredentials.Cert.Unwrap().MarshalToPEM()
     if err != nil {
-        return pmuxlib.ProcessConfig{}, fmt.Errorf(
-            "marshaling host cert to PEM: :%w", err,
-        )
+        return nil, fmt.Errorf("marshaling host cert to PEM: :%w", err)
     }
 
     hostKeyPEM := cert.MarshalX25519PrivateKey(
         hostBootstrap.PrivateCredentials.EncryptingPrivateKey.Bytes(),
     )
 
-    config := map[string]interface{}{
+    config := map[string]any{
         "pki": map[string]string{
             "ca":   string(caCertPEM),
             "cert": string(hostCertPEM),
@@ -99,7 +93,7 @@ func nebulaPmuxProcConfig(
             "punch":   true,
             "respond": true,
         },
-        "tun": map[string]interface{}{
+        "tun": map[string]any{
             "dev": daemonConfig.VPN.Tun.Device,
         },
         "firewall": daemonConfig.VPN.Firewall,
@@ -112,7 +106,7 @@
             "port": "0",
         }
 
-        config["lighthouse"] = map[string]interface{}{
+        config["lighthouse"] = map[string]any{
             "hosts": lighthouseHostIPs,
         }
 
@@ -121,7 +115,9 @@
         _, port, err := net.SplitHostPort(publicAddr)
 
         if err != nil {
-            return pmuxlib.ProcessConfig{}, fmt.Errorf("parsing public address %q: %w", publicAddr, err)
+            return nil, fmt.Errorf(
+                "parsing public address %q: %w", publicAddr, err,
+            )
         }
 
         config["listen"] = map[string]string{
@@ -129,16 +125,55 @@
             "port": port,
         }
 
-        config["lighthouse"] = map[string]interface{}{
+        config["lighthouse"] = map[string]any{
             "hosts":         []string{},
             "am_lighthouse": true,
         }
     }
 
+    return config, nil
+}
+
+func nebulaWriteConfig(
+    runtimeDirPath string,
+    daemonConfig Config,
+    hostBootstrap bootstrap.Bootstrap,
+) (
+    string, error,
+) {
+    config, err := nebulaConfig(daemonConfig, hostBootstrap)
+    if err != nil {
+        return "", fmt.Errorf("creating nebula config: %w", err)
+    }
+
     nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
 
     if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
-        return pmuxlib.ProcessConfig{}, fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
+        return "", fmt.Errorf("writing nebula.yml to %q: %w", nebulaYmlPath, err)
     }
 
+    return nebulaYmlPath, nil
+}
+
+func nebulaPmuxProcConfig(
+    runtimeDirPath, binDirPath string,
+    daemonConfig Config,
+    hostBootstrap bootstrap.Bootstrap,
+) (
+    pmuxlib.ProcessConfig, error,
+) {
+    config, err := nebulaConfig(daemonConfig, hostBootstrap)
+    if err != nil {
+        return pmuxlib.ProcessConfig{}, fmt.Errorf(
+            "creating nebula config: %w", err,
+        )
+    }
+
+    nebulaYmlPath := filepath.Join(runtimeDirPath, "nebula.yml")
+    if err := yamlutil.WriteYamlFile(config, nebulaYmlPath, 0600); err != nil {
+        return pmuxlib.ProcessConfig{}, fmt.Errorf(
+            "writing nebula.yml to %q: %w", nebulaYmlPath, err,
+        )
+    }
+
     return pmuxlib.ProcessConfig{
@@ -31,8 +31,8 @@ func (c *Children) newPmuxConfig(
         c.logger,
         c.opts.EnvVars.RuntimeDirPath,
         binDirPath,
-        hostBootstrap,
         daemonConfig,
+        hostBootstrap,
     )
     if err != nil {
         return pmuxlib.Config{}, fmt.Errorf(
@@ -17,8 +17,9 @@ import (
 // - dnsmasq
 // - garage (0 or more, depending on configured storage allocations)
 type Children struct {
-    logger *mlog.Logger
-    opts   Opts
+    logger       *mlog.Logger
+    daemonConfig Config
+    opts         Opts
 
     pmux *pmuxlib.Pmux
 }
@@ -46,8 +47,9 @@ func NewChildren(
     }
 
     c := &Children{
-        logger: logger,
-        opts:   *opts,
+        logger:       logger,
+        daemonConfig: daemonConfig,
+        opts:         *opts,
     }
 
     pmuxConfig, err := c.newPmuxConfig(
@@ -84,6 +86,32 @@ func NewChildren(
     return c, nil
 }
 
+// RestartDNSMasq rewrites the dnsmasq config and restarts the process.
+func (c *Children) RestartDNSMasq(hostBootstrap bootstrap.Bootstrap) error {
+    _, err := dnsmasqWriteConfig(
+        c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap,
+    )
+    if err != nil {
+        return fmt.Errorf("writing new dnsmasq config: %w", err)
+    }
+
+    c.pmux.Restart("dnsmasq")
+    return nil
+}
+
+// RestartNebula rewrites the nebula config and restarts the process.
+func (c *Children) RestartNebula(hostBootstrap bootstrap.Bootstrap) error {
+    _, err := nebulaWriteConfig(
+        c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap,
+    )
+    if err != nil {
+        return fmt.Errorf("writing a new nebula config: %w", err)
+    }
+
+    c.pmux.Restart("nebula")
+    return nil
+}
+
 // Shutdown blocks until all child processes have gracefully shut themselves
 // down.
 //
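RestartDNSMasq and RestartNebula share one shape: rewrite the child's config file under the runtime directory, then ask pmux to restart just that named process. A sketch of that common shape, assuming only the fields and helpers introduced above (the restartOne name itself is hypothetical, not part of the commit):

    // Sketch only, not part of the commit.
    func (c *Children) restartOne(
        name string,
        writeConfig func(string, Config, bootstrap.Bootstrap) (string, error),
        hostBootstrap bootstrap.Bootstrap,
    ) error {
        _, err := writeConfig(
            c.opts.EnvVars.RuntimeDirPath, c.daemonConfig, hostBootstrap,
        )
        if err != nil {
            return fmt.Errorf("writing new %s config: %w", name, err)
        }

        c.pmux.Restart(name)
        return nil
    }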
@@ -3,7 +3,6 @@
 package daemon
 
 import (
-    "bytes"
     "context"
     "errors"
     "fmt"
@@ -146,7 +145,6 @@ const (
     daemonStateNoNetwork = iota
     daemonStateInitializing
     daemonStateOk
-    daemonStateRestarting
     daemonStateShutdown
 )
 
@@ -182,10 +180,6 @@ type daemon struct {
 // While still starting up the Daemon for the first time all methods will return
 // ErrInitializing, except Shutdown which will block until initialization is
 // canceled.
-//
-// TODO make daemon smarter, it currently restarts on _any_ change, but
-// it should restart itself only when there's something actually requiring a
-// restart.
 func NewDaemon(
     logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts,
 ) (
@@ -294,8 +288,6 @@ func withCurrBootstrap[Res any](
         return zero, ErrInitializing
     case daemonStateOk:
         return fn(currBootstrap)
-    case daemonStateRestarting:
-        return zero, ErrRestarting
     case daemonStateShutdown:
         return zero, errors.New("already shutdown")
     default:
@@ -303,80 +295,49 @@ func withCurrBootstrap[Res any](
     }
 }
 
-// creates a new bootstrap file using available information from the network. If
-// the new bootstrap file is different than the existing one, the existing one
-// is overwritten and true is returned.
-func (d *daemon) checkBootstrap(
-    ctx context.Context, hostBootstrap bootstrap.Bootstrap,
-) (
-    bootstrap.Bootstrap, bool, error,
-) {
+func (d *daemon) reload(
+    ctx context.Context, newHosts map[nebula.HostName]bootstrap.Host,
+) error {
+    var (
+        newBootstrap = d.currBootstrap
+        thisHost     = d.currBootstrap.ThisHost()
+    )
 
-    thisHost := hostBootstrap.ThisHost()
-
-    newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, hostBootstrap)
-    if err != nil {
-        return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err)
-    }
+    newBootstrap.Hosts = newHosts
 
     // the daemon's view of this host's bootstrap info takes precedence over
     // whatever is in garage
-    newHosts[thisHost.Name] = thisHost
+    newBootstrap.Hosts[thisHost.Name] = thisHost
 
-    newHostsHash, err := bootstrap.HostsHash(newHosts)
+    diff, err := calcBootstrapDiff(d.daemonConfig, d.currBootstrap, newBootstrap)
     if err != nil {
-        return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err)
+        return fmt.Errorf("calculating diff between bootstraps: %w", err)
+    } else if diff == (bootstrapDiff{}) {
+        return nil
     }
 
-    currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts)
-    if err != nil {
-        return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err)
-    }
+    d.logger.Info(ctx, "Bootstrap has changed, storing new bootstrap")
+    d.l.Lock()
+    d.currBootstrap = newBootstrap
+    d.l.Unlock()
 
-    if bytes.Equal(newHostsHash, currHostsHash) {
-        return hostBootstrap, false, nil
-    }
+    var errs []error
 
-    hostBootstrap.Hosts = newHosts
-
-    return hostBootstrap, true, nil
-}
-
-// blocks until a new bootstrap is available or context is canceled
-func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap {
-
-    ticker := time.NewTicker(3 * time.Minute)
-    defer ticker.Stop()
-
-    for {
-        select {
-
-        case <-ctx.Done():
-            return bootstrap.Bootstrap{}
-
-        case <-ticker.C:
-
-            d.logger.Info(ctx, "Checking for changes to bootstrap")
-
-            newBootstrap, changed, err := d.checkBootstrap(
-                ctx, d.currBootstrap,
-            )
-            if err != nil {
-                d.logger.Error(ctx, "Checking bootstrap for changes failed", err)
-                continue
-            } else if !changed {
-                continue
-            }
-
-            err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap)
-            if err != nil {
-                d.logger.Error(ctx, "Writing new bootstrap to disk failed", err)
-                continue
-            }
-
-            return newBootstrap
+    if diff.dnsChanged {
+        d.logger.Info(ctx, "Restarting dnsmasq to account for bootstrap changes")
+        if err := d.children.RestartDNSMasq(newBootstrap); err != nil {
+            errs = append(errs, fmt.Errorf("restarting dnsmasq: %w", err))
+        }
+    }
 
+    if diff.nebulaChanged {
+        d.logger.Info(ctx, "Restarting nebula to account for bootstrap changes")
+        if err := d.children.RestartNebula(newBootstrap); err != nil {
+            errs = append(errs, fmt.Errorf("restarting nebula: %w", err))
+        }
+    }
 
+    return errors.Join(errs...)
 }
 
 func (d *daemon) postInit(ctx context.Context) bool {
@@ -450,7 +411,10 @@ func (d *daemon) postInit(ctx context.Context) bool {
     return true
 }
 
-func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
+func (d *daemon) restartLoop(
+    ctx context.Context,
+    readyCh chan<- struct{},
+) {
     wait := func(d time.Duration) bool {
         select {
         case <-ctx.Done():
@@ -465,15 +429,20 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
         case readyCh <- struct{}{}:
         default:
         }
         readyCh = nil
     }
 
-    for {
-        if ctx.Err() != nil {
-            return
-        }
+    var (
+        children *Children
+        err      error
+    )
+
+    for {
         d.logger.Info(ctx, "Creating child processes")
-        children, err := NewChildren(
+        // TODO this could probably get moved outside of restartLoop, and into
+        // initialize. If it fails the error can get passed up to the caller and
+        // no changes can be made.
+        children, err = NewChildren(
             ctx,
             d.logger.WithNamespace("children"),
             d.envBinDirPath,
@@ -508,27 +477,33 @@ func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
         d.l.Unlock()
 
         ready()
         break
     }
 
-    newBootstrap := d.watchForChanges(ctx)
-    if ctx.Err() != nil {
-        return
-    }
+    ticker := time.NewTicker(3 * time.Minute)
+    defer ticker.Stop()
 
-    d.logger.Info(ctx, "Bootstrap has changed, will restart daemon")
-    d.l.Lock()
-    d.currBootstrap = newBootstrap
-    d.state = daemonStateRestarting
-    d.l.Unlock()
+    for {
+        select {
+        case <-ctx.Done():
+            return
 
-    d.logger.Info(ctx, "Shutting down previous child processes")
-    if err := d.children.Shutdown(); err != nil {
-        d.logger.Fatal(ctx, "Failed to cleanly shutdown children, there may be orphaned child processes", err)
-    }
+        case <-ticker.C:
 
-    // in case context was canceled while shutting the Children down, we
-    // don't want the Shutdown method to re-attempt calling Shutdown on
-    // it.
-    d.children = nil
+            d.logger.Info(ctx, "Checking for bootstrap changes")
+            newHosts, err := d.getGarageBootstrapHosts(ctx, d.logger, d.currBootstrap)
+            if err != nil {
+                d.logger.Error(ctx, "Failed to get hosts from garage", err)
+                continue
+            }
+
+            if err := d.reload(ctx, newHosts); err != nil {
+                d.logger.Error(ctx, "Reloading with new host data failed", err)
+                continue
+            }
+
+            ready()
+        }
     }
 }
 
@@ -11,10 +11,6 @@ var (
     // being initialized.
     ErrInitializing = jsonrpc2.NewError(2, "Network is being initialized")
 
-    // ErrRestarting is returned when a network is unavailable due to being
-    // restarted.
-    ErrRestarting = jsonrpc2.NewError(3, "Network is being restarted")
-
     // ErrAlreadyJoined is returned when the daemon is instructed to create or
     // join a new network, but it is already joined to a network.
     ErrAlreadyJoined = jsonrpc2.NewError(4, "Already joined to a network")