// Package daemon implements the isle daemon, which is a long-running service // managing all isle background tasks and sub-processes for a single network. package daemon import ( "bytes" "context" "errors" "fmt" "io" "io/fs" "isle/admin" "isle/bootstrap" "isle/daemon/jsonrpc2" "isle/garage" "isle/nebula" "net" "os" "regexp" "sync" "time" "dev.mediocregopher.com/mediocre-go-lib.git/mlog" ) var hostNameRegexp = regexp.MustCompile(`^[a-z][a-z0-9\-]*$`) func validateHostName(name string) error { if !hostNameRegexp.MatchString(name) { return errors.New("a host's name must start with a letter and only contain letters, numbers, and dashes") } return nil } // Daemon presents all functionality required for client frontends to interact // with isle, typically via the unix socket. type Daemon interface { // CreateNetwork will initialize a new network using the given parameters. // - name: Human-readable name of the network. // - domain: Primary domain name that network services are served under. // - ipNetStr: An IP + subnet mask which represents both the IP of this // first host in the network, as well as the overall range of possible IPs // in the network. // - hostName: The name of this first host in the network. // // An Admin instance is returned, which is necessary to perform admin // actions in the future. CreateNetwork( ctx context.Context, name, domain, ipNetStr, hostName string, ) ( admin.Admin, error, ) // JoinNetwork joins the Daemon to an existing network using the given // Bootstrap. // // Errors: // - ErrAlreadyJoined JoinNetwork(context.Context, bootstrap.Bootstrap) error // GetGarageBootstrapHosts loads (and verifies) the .json.signed // file for all hosts stored in garage. GetGarageBootstrapHosts( ctx context.Context, ) ( map[string]bootstrap.Host, error, ) // Shutdown blocks until all resources held or created by the daemon, // including child processes it has started, have been cleaned up. // // If this returns an error then it's possible that child processes are // still running and are no longer managed. Shutdown() error } // Opts are optional parameters which can be passed in when initializing a new // Daemon instance. A nil Opts is equivalent to a zero value. type Opts struct { // Stdout and Stderr are what the associated outputs from child processes // will be directed to. Stdout, Stderr io.Writer EnvVars EnvVars // Defaults to that returned by GetEnvVars. } func (o *Opts) withDefaults() *Opts { if o == nil { o = new(Opts) } if o.Stdout == nil { o.Stdout = os.Stdout } if o.Stderr == nil { o.Stderr = os.Stderr } if o.EnvVars == (EnvVars{}) { o.EnvVars = GetEnvVars() } return o } const ( daemonStateNoNetwork = iota daemonStateInitializing daemonStateOk daemonStateRestarting daemonStateShutdown ) type daemon struct { logger *mlog.Logger daemonConfig Config envBinDirPath string opts *Opts l sync.RWMutex state int children *Children currBootstrap bootstrap.Bootstrap shutdownCh chan struct{} wg sync.WaitGroup } // NewDaemon initializes and returns a Daemon instance which will manage all // child processes and state required by the isle service, as well as an HTTP // socket over which RPC calls will be served. // // Inner Children instance(s) will be wrapped such that they will be // automatically shutdown and re-created whenever there's changes in the network // which require the configuration to be changed (e.g. a new nebula lighthouse). // During such an inner restart all methods will return ErrRestarting, except // Shutdown which will block until the currently executing restart is finished // and then shutdown cleanly from there. // // While still starting up the Daemon for the first time all methods will return // ErrInitializing, except Shutdown which will block until initialization is // canceled. // // TODO make daemon smarter, it currently restarts on _any_ change, but // it should restart itself only when there's something actually requiring a // restart. func NewDaemon( logger *mlog.Logger, daemonConfig Config, envBinDirPath string, opts *Opts, ) ( Daemon, error, ) { var ( d = &daemon{ logger: logger, daemonConfig: daemonConfig, envBinDirPath: envBinDirPath, opts: opts.withDefaults(), shutdownCh: make(chan struct{}), } bootstrapFilePath = bootstrap.StateDirPath(d.opts.EnvVars.StateDirPath) ) currBootstrap, err := bootstrap.FromFile(bootstrapFilePath) if errors.Is(err, fs.ErrNotExist) { // daemon has never had a network created or joined } else if err != nil { return nil, fmt.Errorf( "loading bootstrap from %q: %w", bootstrapFilePath, err, ) } else if err := d.initialize(currBootstrap, nil); err != nil { return nil, fmt.Errorf("initializing with bootstrap: %w", err) } return d, nil } // initialize must be called with d.l write lock held, _but_ the lock should be // released just after initialize returns. // // readyCh will be written to everytime daemon changes state to daemonStateOk, // unless it is nil or blocks. func (d *daemon) initialize( currBootstrap bootstrap.Bootstrap, readyCh chan<- struct{}, ) error { // we update this Host's data using whatever configuration has been provided // by the daemon config. This way the daemon has the most up-to-date // possible bootstrap. This updated bootstrap will later get updated in // garage as a background daemon task, so other hosts will see it as well. currBootstrap, err := coalesceDaemonConfigAndBootstrap( d.daemonConfig, currBootstrap, ) if err != nil { return fmt.Errorf("combining daemon configuration into bootstrap: %w", err) } err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, currBootstrap) if err != nil { return fmt.Errorf("writing bootstrap to state dir: %w", err) } d.currBootstrap = currBootstrap d.state = daemonStateInitializing ctx, cancel := context.WithCancel(context.Background()) d.wg.Add(1) go func() { defer d.wg.Done() <-d.shutdownCh cancel() }() d.wg.Add(1) go func() { defer d.wg.Done() d.restartLoop(ctx, readyCh) d.logger.Debug(ctx, "Daemon restart loop stopped") }() return nil } func withCurrBootstrap[Res any]( d *daemon, fn func(bootstrap.Bootstrap) (Res, error), ) (Res, error) { var zero Res d.l.RLock() defer d.l.RUnlock() currBootstrap, state := d.currBootstrap, d.state switch state { case daemonStateNoNetwork: return zero, ErrNoNetwork case daemonStateInitializing: return zero, ErrInitializing case daemonStateOk: return fn(currBootstrap) case daemonStateRestarting: return zero, ErrRestarting case daemonStateShutdown: return zero, errors.New("already shutdown") default: panic(fmt.Sprintf("unknown state %d", d.state)) } } // creates a new bootstrap file using available information from the network. If // the new bootstrap file is different than the existing one, the existing one // is overwritten and true is returned. func (d *daemon) checkBootstrap( ctx context.Context, hostBootstrap bootstrap.Bootstrap, ) ( bootstrap.Bootstrap, bool, error, ) { thisHost := hostBootstrap.ThisHost() newHosts, err := getGarageBootstrapHosts(ctx, d.logger, hostBootstrap) if err != nil { return bootstrap.Bootstrap{}, false, fmt.Errorf("getting hosts from garage: %w", err) } // the daemon's view of this host's bootstrap info takes precedence over // whatever is in garage newHosts[thisHost.Name] = thisHost newHostsHash, err := bootstrap.HostsHash(newHosts) if err != nil { return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of new hosts: %w", err) } currHostsHash, err := bootstrap.HostsHash(hostBootstrap.Hosts) if err != nil { return bootstrap.Bootstrap{}, false, fmt.Errorf("calculating hash of current hosts: %w", err) } if bytes.Equal(newHostsHash, currHostsHash) { return hostBootstrap, false, nil } hostBootstrap.Hosts = newHosts return hostBootstrap, true, nil } // blocks until a new bootstrap is available or context is canceled func (d *daemon) watchForChanges(ctx context.Context) bootstrap.Bootstrap { ticker := time.NewTicker(3 * time.Minute) defer ticker.Stop() for { select { case <-ctx.Done(): return bootstrap.Bootstrap{} case <-ticker.C: d.logger.Info(ctx, "Checking for changes to bootstrap") newBootstrap, changed, err := d.checkBootstrap( ctx, d.currBootstrap, ) if err != nil { d.logger.Error(ctx, "Checking bootstrap for changes failed", err) continue } else if !changed { continue } err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, newBootstrap) if err != nil { d.logger.Error(ctx, "Writing new bootstrap to disk failed", err) continue } return newBootstrap } } } func (d *daemon) postInit(ctx context.Context) bool { if len(d.daemonConfig.Storage.Allocations) > 0 { if !until( ctx, d.logger, "Applying garage layout", func(ctx context.Context) error { return GarageApplyLayout( ctx, d.logger, d.daemonConfig, d.currBootstrap, ) }, ) { return false } } // This is only necessary during network creation, otherwise the bootstrap // should already have these credentials built in. // // TODO this is pretty hacky, but there doesn't seem to be a better way to // manage it at the moment. if d.currBootstrap.Garage.GlobalBucketS3APICredentials == (garage.S3APICredentials{}) { currBootstrap := d.currBootstrap if !until( ctx, d.logger, "Initializing garage shared global bucket", func(ctx context.Context) error { garageGlobalBucketCreds, err := garageInitializeGlobalBucket( ctx, d.logger, d.daemonConfig, d.currBootstrap, ) if err != nil { return fmt.Errorf("initializing global bucket: %w", err) } currBootstrap.Garage.GlobalBucketS3APICredentials = garageGlobalBucketCreds d.logger.Info(ctx, "Writing bootstrap to state directory") err = writeBootstrapToStateDir( d.opts.EnvVars.StateDirPath, currBootstrap, ) if err != nil { return fmt.Errorf("writing bootstrap to state dir: %w", err) } return nil }, ) { return false } d.l.Lock() d.currBootstrap = currBootstrap d.l.Unlock() } if !until( ctx, d.logger, "Updating host info in garage", func(ctx context.Context) error { return d.putGarageBoostrapHost(ctx) }, ) { return false } return true } func (d *daemon) restartLoop(ctx context.Context, readyCh chan<- struct{}) { wait := func(d time.Duration) bool { select { case <-ctx.Done(): return false case <-time.After(d): return true } } ready := func() { select { case readyCh <- struct{}{}: default: } } for { if ctx.Err() != nil { return } d.logger.Info(ctx, "Creating child processes") children, err := NewChildren( ctx, d.logger.WithNamespace("children"), d.daemonConfig, d.currBootstrap, d.envBinDirPath, d.opts, ) if errors.Is(err, context.Canceled) { return } else if err != nil { d.logger.Error(ctx, "failed to initialize daemon", err) if !wait(1 * time.Second) { return } continue } d.logger.Info(ctx, "Child processes created") d.l.Lock() d.children = children d.l.Unlock() if !d.postInit(ctx) { return } d.l.Lock() d.state = daemonStateOk d.l.Unlock() ready() newBootstrap := d.watchForChanges(ctx) if ctx.Err() != nil { return } d.logger.Info(ctx, "Bootstrap has changed, will restart daemon") d.l.Lock() d.currBootstrap = newBootstrap d.state = daemonStateRestarting d.l.Unlock() d.logger.Info(ctx, "Shutting down previous child processes") if err := d.children.Shutdown(); err != nil { d.logger.Fatal(ctx, "Failed to cleanly shutdown children, there may be orphaned child processes", err) } // in case context was canceled while shutting the Children down, we // don't want the Shutdown method to re-attempt calling Shutdown on // it. d.children = nil } } func (d *daemon) CreateNetwork( ctx context.Context, name, domain, ipNetStr, hostName string, ) ( admin.Admin, error, ) { ip, subnet, err := net.ParseCIDR(ipNetStr) if err != nil { return admin.Admin{}, jsonrpc2.NewInvalidParamsError( "parsing %q as a CIDR: %v", ipNetStr, err, ) } if err := validateHostName(hostName); err != nil { return admin.Admin{}, jsonrpc2.NewInvalidParamsError( "invalid hostname: %v", err, ) } nebulaCACreds, err := nebula.NewCACredentials(domain, subnet) if err != nil { return admin.Admin{}, fmt.Errorf("creating nebula CA cert: %w", err) } adm := admin.Admin{ CreationParams: admin.CreationParams{ ID: randStr(32), Name: name, Domain: domain, }, } garageBootstrap := bootstrap.Garage{ RPCSecret: randStr(32), AdminToken: randStr(32), } hostBootstrap, err := bootstrap.New( nebulaCACreds, adm.CreationParams, garageBootstrap, hostName, ip, ) if err != nil { return adm, fmt.Errorf("initializing bootstrap data: %w", err) } d.l.Lock() if d.state != daemonStateNoNetwork { d.l.Unlock() return adm, ErrAlreadyJoined } if len(d.daemonConfig.Storage.Allocations) < 3 { d.l.Unlock() return adm, ErrInvalidConfig.WithData( "At least three storage allocations are required.", ) } readyCh := make(chan struct{}, 1) err = d.initialize(hostBootstrap, readyCh) d.l.Unlock() if err != nil { return adm, fmt.Errorf("initializing daemon: %w", err) } select { case <-readyCh: case <-ctx.Done(): return adm, ctx.Err() } // As part of postInit, which is called prior to ready(), the restartLoop // will check if the global bucket creds have been created yet or not, and // create them if so. So once ready() is called we can get the created creds // from the currBootstrap d.l.RLock() garageGlobalBucketCreds := d.currBootstrap.Garage.GlobalBucketS3APICredentials d.l.RUnlock() adm.Nebula.CACredentials = nebulaCACreds adm.Garage.RPCSecret = hostBootstrap.Garage.RPCSecret adm.Garage.GlobalBucketS3APICredentials = garageGlobalBucketCreds return adm, nil } func (d *daemon) JoinNetwork( ctx context.Context, newBootstrap bootstrap.Bootstrap, ) error { d.l.Lock() if d.state != daemonStateNoNetwork { d.l.Unlock() return ErrAlreadyJoined } readyCh := make(chan struct{}, 1) err := d.initialize(newBootstrap, readyCh) d.l.Unlock() if err != nil { return fmt.Errorf("initializing daemon: %w", err) } select { case <-readyCh: return nil case <-ctx.Done(): return ctx.Err() } } func (d *daemon) GetGarageBootstrapHosts( ctx context.Context, ) ( map[string]bootstrap.Host, error, ) { return withCurrBootstrap(d, func( currBootstrap bootstrap.Bootstrap, ) ( map[string]bootstrap.Host, error, ) { return getGarageBootstrapHosts(ctx, d.logger, currBootstrap) }) } func (d *daemon) Shutdown() error { d.l.Lock() defer d.l.Unlock() close(d.shutdownCh) d.wg.Wait() d.state = daemonStateShutdown if d.children != nil { if err := d.children.Shutdown(); err != nil { return fmt.Errorf("shutting down children: %w", err) } } return nil }