// Package daemon implements the isle daemon, which is a long-running service
// managing all isle background tasks and sub-processes for a single network.
package daemon

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"isle/admin"
	"isle/bootstrap"
	"isle/garage"
	"isle/nebula"
	"os"
	"sync"
	"time"

	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
)

// Daemon presents all functionality required for client frontends to interact
// with isle, typically via the unix socket.
type Daemon interface {

	// CreateNetwork will initialize a new network using the given parameters.
	// - name: Human-readable name of the network.
	// - domain: Primary domain name that network services are served under.
	// - ipNet: An IP subnet, in CIDR form, which will be the overall range of
	//   possible IPs in the network. The first IP in this network range will
	//   become this first host's IP.
	// - hostName: The name of this first host in the network.
	//
	// An Admin instance is returned, which is necessary to perform admin
	// actions in the future.
	CreateNetwork(
		ctx context.Context, name, domain string,
		ipNet nebula.IPNet,
		hostName nebula.HostName,
	) (
		admin.Admin, error,
	)

	// JoinNetwork joins the Daemon to an existing network using the given
	// Bootstrap.
	//
	// Errors:
	// - ErrAlreadyJoined
	JoinNetwork(context.Context, bootstrap.Bootstrap) error

	// GetBootstrap returns the currently active Bootstrap.
	GetBootstrap(context.Context) (bootstrap.Bootstrap, error)

	// Shutdown blocks until all resources held or created by the daemon,
	// including child processes it has started, have been cleaned up.
	//
	// If this returns an error then it's possible that child processes are
	// still running and are no longer managed.
	Shutdown() error
}

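// A minimal usage sketch of this interface from a client frontend's point of
// view, given a Daemon d and a context ctx. The network name, domain, and the
// way ipNet and hostName are obtained are illustrative assumptions only, not
// part of this package:
//
//	var (
//		ipNet    nebula.IPNet    // parsed from a user-supplied CIDR, e.g. "10.10.0.0/16"
//		hostName nebula.HostName // validated host name, e.g. "first-host"
//	)
//
//	adm, err := d.CreateNetwork(ctx, "my-network", "network.example", ipNet, hostName)
//	if err != nil {
//		// handle error; ErrAlreadyJoined is returned if a network already exists
//	}
//	_ = adm // keep the returned Admin somewhere safe; it's needed for admin actions
//
//	bs, err := d.GetBootstrap(ctx) // the currently active Bootstrap
//	_, _ = bs, err
//
//	if err := d.Shutdown(); err != nil {
//		// child processes may still be running and are no longer managed
//	}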

// Opts are optional parameters which can be passed in when initializing a new
// Daemon instance. A nil Opts is equivalent to a zero value.
type Opts struct {
	// Stdout and Stderr are what the associated outputs from child processes
	// will be directed to.
	Stdout, Stderr io.Writer

	EnvVars EnvVars // Defaults to that returned by GetEnvVars.
}

func (o *Opts) withDefaults() *Opts {
	if o == nil {
		o = new(Opts)
	}

	if o.Stdout == nil {
		o.Stdout = os.Stdout
	}

	if o.Stderr == nil {
		o.Stderr = os.Stderr
	}

	if o.EnvVars == (EnvVars{}) {
		o.EnvVars = GetEnvVars()
	}

	return o
}

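// A short sketch of how Opts might be filled in by a caller which wants to
// capture child process output in buffers instead of inheriting the daemon's
// own stdout/stderr; the buffers are illustrative assumptions only:
//
//	var stdout, stderr bytes.Buffer
//	opts := &daemon.Opts{
//		Stdout: &stdout,
//		Stderr: &stderr,
//		// EnvVars left as its zero value, so GetEnvVars() is used.
//	}
//
// Passing a nil *Opts is also valid and behaves the same as a zero value.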

const (
	daemonStateNoNetwork = iota
	daemonStateInitializing
	daemonStateOk
	daemonStateRestarting
	daemonStateShutdown
)

type daemon struct {
	logger        *mlog.Logger
	daemonConfig  Config
	envBinDirPath string
	opts          *Opts

	l             sync.RWMutex
	state         int
	children      *Children
	currBootstrap bootstrap.Bootstrap

	shutdownCh chan struct{}
	wg         sync.WaitGroup
}

// NewDaemon initializes and returns a Daemon instance which will manage all
// child processes and state required by the isle service, as well as an HTTP
// socket over which RPC calls will be served.
//
// Inner Children instance(s) will be wrapped such that they will be
// automatically shut down and re-created whenever there are changes in the
// network which require the configuration to be changed (e.g. a new nebula
// lighthouse). During such an inner restart all methods will return
// ErrRestarting, except Shutdown which will block until the currently
// executing restart is finished and then shut down cleanly from there.
//
// While the Daemon is still starting up for the first time all methods will
// return ErrInitializing, except Shutdown which will block until
// initialization is canceled.
//
// TODO make daemon smarter, it currently restarts on _any_ change, but
// it should restart itself only when there's something actually requiring a
// restart.
func NewDaemon(
	logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts,
) (
	Daemon, error,
) {
	var (
		d = &daemon{
			logger:        logger,
			daemonConfig:  daemonConfig,
			envBinDirPath: envBinDirPath,
			opts:          opts.withDefaults(),
			shutdownCh:    make(chan struct{}),
		}
		bootstrapFilePath = bootstrap.StateDirPath(d.opts.EnvVars.StateDirPath)
	)

	currBootstrap, err := bootstrap.FromFile(bootstrapFilePath)
	if errors.Is(err, fs.ErrNotExist) {
		// daemon has never had a network created or joined
	} else if err != nil {
		return nil, fmt.Errorf(
			"loading bootstrap from %q: %w", bootstrapFilePath, err,
		)
	} else if err := d.initialize(currBootstrap, nil); err != nil {
		return nil, fmt.Errorf("initializing with bootstrap: %w", err)
	}

	return d, nil
}

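// Because methods can fail transiently with ErrInitializing or ErrRestarting
// while the daemon is (re)configuring itself, callers may want to retry such
// calls. A rough sketch, assuming these errors can be matched with errors.Is
// (the retry interval is an arbitrary assumption):
//
//	for {
//		bs, err := d.GetBootstrap(ctx)
//		if errors.Is(err, ErrInitializing) || errors.Is(err, ErrRestarting) {
//			time.Sleep(time.Second)
//			continue
//		}
//		_, _ = bs, err // use bs, or handle any other err
//		break
//	}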

// initialize must be called with d.l write lock held, _but_ the lock should be
// released just after initialize returns.
//
// readyCh will be written to every time the daemon changes state to
// daemonStateOk, unless it is nil or blocks.
func (d *daemon) initialize(
	currBootstrap bootstrap.Bootstrap, readyCh chan<- struct{},
) error {
	// we update this host's data using whatever configuration has been
	// provided by the daemon config. This way the daemon has the most
	// up-to-date possible bootstrap. This updated bootstrap will later get
	// updated in garage as a background daemon task, so other hosts will see
	// it as well.
	currBootstrap, err := coalesceDaemonConfigAndBootstrap(
		d.daemonConfig, currBootstrap,
	)
	if err != nil {
		return fmt.Errorf("combining daemon configuration into bootstrap: %w", err)
	}

	err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, currBootstrap)
	if err != nil {
		return fmt.Errorf("writing bootstrap to state dir: %w", err)
	}

	d.currBootstrap = currBootstrap
	d.state = daemonStateInitializing

	ctx, cancel := context.WithCancel(context.Background())
	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		<-d.shutdownCh
		cancel()
	}()

	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		d.restartLoop(ctx, readyCh)
		d.logger.Debug(ctx, "Daemon restart loop stopped")
	}()

	return nil
}

func withCurrBootstrap[Res any](
	d *daemon, fn func(bootstrap.Bootstrap) (Res, error),
) (Res, error) {
	var zero Res
	d.l.RLock()
	defer d.l.RUnlock()

	currBootstrap, state := d.currBootstrap, d.state

	switch state {
	case daemonStateNoNetwork:
		return zero, ErrNoNetwork
	case daemonStateInitializing:
		return zero, ErrInitializing
	case daemonStateOk:
		return fn(currBootstrap)
	case daemonStateRestarting:
		return zero, ErrRestarting
	case daemonStateShutdown:
		return zero, errors.New("already shutdown")
	default:
		panic(fmt.Sprintf("unknown state %d", d.state))
	}
}

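// withCurrBootstrap is the building block for methods which need the active
// bootstrap; GetBootstrap below is the canonical example. As a further
// illustrative sketch only, a hypothetical accessor for this host's entry
// could look like the following (the bootstrap.Host type name is an
// assumption):
//
//	func (d *daemon) getThisHost() (bootstrap.Host, error) {
//		return withCurrBootstrap(d, func(b bootstrap.Bootstrap) (bootstrap.Host, error) {
//			return b.ThisHost(), nil
//		})
//	}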

// checkBootstrap creates a new bootstrap using available information from the
// network. If the new bootstrap is different from the existing one, the new
// bootstrap and true are returned.
func (d *daemon) checkBootstrap(
	ctx context.Context, hostBootstrap bootstrap.Bootstrap,
) (
	bootstrap.Bootstrap, bool, error,
) {
	thisHost := hostBootstrap.ThisHost()

	newHosts, err := getGarageBootstrapHosts(ctx, d.logger, hostBootstrap)
	if err != nil {
		return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err)
	}

	// the daemon's view of this host's bootstrap info takes precedence over
	// whatever is in garage
	newHosts[thisHost.Name] = thisHost

	newHostsHash, err := bootstrap.HostsHash(newHosts)
	if err != nil {
		return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err)
	}

	currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts)
	if err != nil {
		return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err)
	}

	if bytes.Equal(newHostsHash, currHostsHash) {
		return hostBootstrap, false, nil
	}

	hostBootstrap.Hosts = newHosts

	return hostBootstrap, true, nil
}

// blocks until a new bootstrap is available or context is canceled
func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap {
	ticker := time.NewTicker(3 * time.Minute)
	defer ticker.Stop()

	for {
		select {

		case <-ctx.Done():
			return bootstrap.Bootstrap{}

		case <-ticker.C:

			d.logger.Info(ctx, "Checking for changes to bootstrap")

			newBootstrap, changed, err := d.checkBootstrap(
				ctx, d.currBootstrap,
			)
			if err != nil {
				d.logger.Error(ctx, "Checking bootstrap for changes failed", err)
				continue
			} else if !changed {
				continue
			}

			err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap)
			if err != nil {
				d.logger.Error(ctx, "Writing new bootstrap to disk failed", err)
				continue
			}

			return newBootstrap
		}
	}
}

func (d *daemon) postInit(ctx context.Context) bool {
	if len(d.daemonConfig.Storage.Allocations) > 0 {
		if !until(
			ctx,
			d.logger,
			"Applying garage layout",
			func(ctx context.Context) error {
				return GarageApplyLayout(
					ctx, d.logger, d.daemonConfig, d.currBootstrap,
				)
			},
		) {
			return false
		}
	}

	// This is only necessary during network creation, otherwise the bootstrap
	// should already have these credentials built in.
	//
	// TODO this is pretty hacky, but there doesn't seem to be a better way to
	// manage it at the moment.
	if d.currBootstrap.Garage.GlobalBucketS3APICredentials == (garage.S3APICredentials{}) {
		currBootstrap := d.currBootstrap
		if !until(
			ctx,
			d.logger,
			"Initializing garage shared global bucket",
			func(ctx context.Context) error {
				garageGlobalBucketCreds, err := garageInitializeGlobalBucket(
					ctx, d.logger, d.daemonConfig, d.currBootstrap,
				)
				if err != nil {
					return fmt.Errorf("initializing global bucket: %w", err)
				}

				currBootstrap.Garage.GlobalBucketS3APICredentials = garageGlobalBucketCreds

				d.logger.Info(ctx, "Writing bootstrap to state directory")
				err = writeBootstrapToStateDir(
					d.opts.EnvVars.StateDirPath, currBootstrap,
				)
				if err != nil {
					return fmt.Errorf("writing bootstrap to state dir: %w", err)
				}

				return nil
			},
		) {
			return false
		}

		d.l.Lock()
		d.currBootstrap = currBootstrap
		d.l.Unlock()
	}

	if !until(
		ctx,
		d.logger,
		"Updating host info in garage",
		func(ctx context.Context) error {
			return d.putGarageBoostrapHost(ctx)
		},
	) {
		return false
	}

	return true
}

func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) {
	wait := func(d time.Duration) bool {
		select {
		case <-ctx.Done():
			return false
		case <-time.After(d):
			return true
		}
	}

	ready := func() {
		select {
		case readyCh <- struct{}{}:
		default:
		}
	}

	for {
		if ctx.Err() != nil {
			return
		}

		d.logger.Info(ctx, "Creating child processes")
		children, err := NewChildren(
			ctx,
			d.logger.WithNamespace("children"),
			d.daemonConfig,
			d.currBootstrap,
			d.envBinDirPath,
			d.opts,
		)
		if errors.Is(err, context.Canceled) {
			return
		} else if err != nil {
			d.logger.Error(ctx, "Failed to create child processes", err)
			if !wait(1 * time.Second) {
				return
			}
			continue
		}

		d.logger.Info(ctx, "Child processes created")

		d.l.Lock()
		d.children = children
		d.l.Unlock()

		if !d.postInit(ctx) {
			return
		}

		d.l.Lock()
		d.state = daemonStateOk
		d.l.Unlock()

		ready()

		newBootstrap := d.watchForChanges(ctx)
		if ctx.Err() != nil {
			return
		}

		d.logger.Info(ctx, "Bootstrap has changed, will restart daemon")
		d.l.Lock()
		d.currBootstrap = newBootstrap
		d.state = daemonStateRestarting
		d.l.Unlock()

		d.logger.Info(ctx, "Shutting down previous child processes")
		if err := d.children.Shutdown(); err != nil {
			d.logger.Fatal(ctx, "Failed to cleanly shutdown children, there may be orphaned child processes", err)
		}

		// in case the context was canceled while shutting the Children down,
		// we don't want the Shutdown method to re-attempt calling Shutdown on
		// it.
		d.children = nil
	}
}

func (d *daemon) CreateNetwork(
	ctx context.Context,
	name, domain string, ipNet nebula.IPNet, hostName nebula.HostName,
) (
	admin.Admin, error,
) {
	nebulaCACreds, err := nebula.NewCACredentials(domain, ipNet)
	if err != nil {
		return admin.Admin{}, fmt.Errorf("creating nebula CA cert: %w", err)
	}

	adm := admin.Admin{
		CreationParams: admin.CreationParams{
			ID:     randStr(32),
			Name:   name,
			Domain: domain,
		},
	}

	garageBootstrap := bootstrap.Garage{
		RPCSecret:  randStr(32),
		AdminToken: randStr(32),
	}

	hostBootstrap, err := bootstrap.New(
		nebulaCACreds,
		adm.CreationParams,
		garageBootstrap,
		hostName,
		ipNet.FirstAddr(),
	)
	if err != nil {
		return adm, fmt.Errorf("initializing bootstrap data: %w", err)
	}

	d.l.Lock()

	if d.state != daemonStateNoNetwork {
		d.l.Unlock()
		return adm, ErrAlreadyJoined
	}

	if len(d.daemonConfig.Storage.Allocations) < 3 {
		d.l.Unlock()
		return adm, ErrInvalidConfig.WithData(
			"At least three storage allocations are required.",
		)
	}

	readyCh := make(chan struct{}, 1)

	err = d.initialize(hostBootstrap, readyCh)
	d.l.Unlock()
	if err != nil {
		return adm, fmt.Errorf("initializing daemon: %w", err)
	}

	select {
	case <-readyCh:
	case <-ctx.Done():
		return adm, ctx.Err()
	}

	// As part of postInit, which is called prior to ready(), the restartLoop
	// will check whether the global bucket creds have been created yet, and
	// create them if not. So once ready() is called we can get the created
	// creds from the currBootstrap.
	d.l.RLock()
	garageGlobalBucketCreds := d.currBootstrap.Garage.GlobalBucketS3APICredentials
	d.l.RUnlock()

	adm.Nebula.CACredentials = nebulaCACreds
	adm.Garage.RPCSecret = hostBootstrap.Garage.RPCSecret
	adm.Garage.GlobalBucketS3APICredentials = garageGlobalBucketCreds

	return adm, nil
}

func (d *daemon) JoinNetwork(
	ctx context.Context, newBootstrap bootstrap.Bootstrap,
) error {
	d.l.Lock()

	if d.state != daemonStateNoNetwork {
		d.l.Unlock()
		return ErrAlreadyJoined
	}

	readyCh := make(chan struct{}, 1)

	err := d.initialize(newBootstrap, readyCh)
	d.l.Unlock()
	if err != nil {
		return fmt.Errorf("initializing daemon: %w", err)
	}

	select {
	case <-readyCh:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func (d *daemon) GetBootstrap(ctx context.Context) (bootstrap.Bootstrap, error) {
	return withCurrBootstrap(d, func(
		currBootstrap bootstrap.Bootstrap,
	) (
		bootstrap.Bootstrap, error,
	) {
		return currBootstrap, nil
	})
}

func (d *daemon) Shutdown() error {
	d.l.Lock()
	defer d.l.Unlock()

	close(d.shutdownCh)
	d.wg.Wait()
	d.state = daemonStateShutdown

	if d.children != nil {
		if err := d.children.Shutdown(); err != nil {
			return fmt.Errorf("shutting down children: %w", err)
		}
	}

	return nil
}