// Package daemon implements the isle daemon, which is a long-running service
// managing all isle background tasks and sub-processes for a single network.
package daemon

import (
	"bytes"
	"context"
	"crypto/rand"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"isle/bootstrap"
	"isle/jsonutil"
	"isle/nebula"
	"isle/secrets"
	"net/netip"
	"os"
	"path/filepath"
	"sync"
	"time"

	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
)

// CreateHostOpts are optional parameters to the CreateHost method.
type CreateHostOpts struct {
	// IP address of the new host. An IP address will be randomly chosen if
	// one is not given here.
	IP netip.Addr

	// CanCreateHosts indicates that the bootstrap produced by CreateHost
	// should give the new host the ability to create new hosts as well.
	CanCreateHosts bool

	// TODO add nebula cert tags
}

// Daemon presents all functionality required for client frontends to interact
// with isle, typically via the unix socket.
type Daemon interface {
	// CreateNetwork will initialize a new network using the given parameters.
	// - name: Human-readable name of the network.
	// - domain: Primary domain name that network services are served under.
	// - ipNet: An IP subnet, in CIDR form, which will be the overall range of
	//   possible IPs in the network. The first IP in this network range will
	//   become this first host's IP.
	// - hostName: The name of this first host in the network.
	//
	// The daemon on which this is called will become the first host in the
	// network, and will have full administrative privileges.
	CreateNetwork(
		ctx context.Context,
		name, domain string,
		ipNet nebula.IPNet,
		hostName nebula.HostName,
	) error

	// JoinNetwork joins the Daemon to an existing network using the given
	// Bootstrap.
	//
	// Errors:
	// - ErrAlreadyJoined
	JoinNetwork(context.Context, JoiningBootstrap) error

	// GetBootstrap returns the currently active Bootstrap.
	GetBootstrap(context.Context) (bootstrap.Bootstrap, error)

	// GetGarageClientParams returns a GarageClientParams for the current
	// network state.
	GetGarageClientParams(context.Context) (GarageClientParams, error)

	// RemoveHost removes the host of the given name from the network.
	RemoveHost(context.Context, nebula.HostName) error

	// CreateHost creates a bootstrap for a new host with the given name and
	// IP address.
	CreateHost(
		ctx context.Context,
		hostName nebula.HostName,
		opts CreateHostOpts,
	) (
		JoiningBootstrap, error,
	)

	// CreateNebulaCertificate creates and signs a new nebula certificate for
	// an existing host, given the public key for that host. This is currently
	// mostly useful for creating certs for mobile devices.
	//
	// TODO replace this with CreateHostBootstrap, and the
	// CreateNebulaCertificate RPC method can just pull the cert out of that.
	//
	// Errors:
	// - ErrHostNotFound
	CreateNebulaCertificate(
		ctx context.Context,
		hostName nebula.HostName,
		hostPubKey nebula.EncryptingPublicKey,
	) (
		nebula.Certificate, error,
	)

	// Shutdown blocks until all resources held or created by the daemon,
	// including child processes it has started, have been cleaned up.
	//
	// If this returns an error then it's possible that child processes are
	// still running and are no longer managed.
	Shutdown() error
}

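// A typical client-side sequence for standing up a brand new network looks
// something like the following (a minimal sketch; the network name, domain,
// subnet, and host name values are purely illustrative, and error handling is
// elided):
//
//	var d Daemon // obtained from NewDaemon
//
//	err := d.CreateNetwork(
//		ctx,
//		"my-network",       // name
//		"isle.example.com", // domain
//		ipNet,              // e.g. the nebula.IPNet for "10.10.0.0/16"
//		nebula.HostName("first-host"),
//	)
//
// Once the network exists, further hosts are provisioned via CreateHost and
// join using JoinNetwork with the resulting JoiningBootstrap.
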
// Opts are optional parameters which can be passed in when initializing a new
// Daemon instance. A nil Opts is equivalent to a zero value.
type Opts struct {
	// Stdout and Stderr are what the associated outputs from child processes
	// will be directed to.
	Stdout, Stderr io.Writer

	EnvVars EnvVars // Defaults to that returned by GetEnvVars.
}

func (o *Opts) withDefaults() *Opts {
	if o == nil {
		o = new(Opts)
	}

	if o.Stdout == nil {
		o.Stdout = os.Stdout
	}

	if o.Stderr == nil {
		o.Stderr = os.Stderr
	}

	if o.EnvVars == (EnvVars{}) {
		o.EnvVars = GetEnvVars()
	}

	return o
}

const (
	daemonStateNoNetwork = iota
	daemonStateInitializing
	daemonStateOk
	daemonStateShutdown
)

type daemon struct {
	logger        *mlog.Logger
	daemonConfig  Config
	envBinDirPath string
	opts          *Opts

	secretsStore     secrets.Store
	garageAdminToken string

	l             sync.RWMutex
	state         int
	children      *Children
	currBootstrap bootstrap.Bootstrap

	shutdownCh chan struct{}
	wg         sync.WaitGroup
}

// NewDaemon initializes and returns a Daemon instance which will manage all
// child processes and state required by the isle service, as well as an HTTP
// socket over which RPC calls will be served.
//
// Inner Children instance(s) will be wrapped such that they will be
// automatically shut down and re-created whenever there are changes in the
// network which require the configuration to be changed (e.g. a new nebula
// lighthouse). During such an inner restart all methods will return
// ErrRestarting, except Shutdown, which will block until the currently
// executing restart is finished and then shut down cleanly from there.
//
// While the Daemon is still starting up for the first time, all methods will
// return ErrInitializing, except Shutdown, which will block until
// initialization is canceled.
func NewDaemon(
	ctx context.Context,
	logger *mlog.Logger,
	daemonConfig Config,
	envBinDirPath string,
	opts *Opts,
) (
	Daemon, error,
) {
	var (
		d = &daemon{
			logger:           logger,
			daemonConfig:     daemonConfig,
			envBinDirPath:    envBinDirPath,
			opts:             opts.withDefaults(),
			garageAdminToken: randStr(32),
			shutdownCh:       make(chan struct{}),
		}

		bootstrapFilePath = bootstrap.StateDirPath(d.opts.EnvVars.StateDirPath)
	)

	if err := d.opts.EnvVars.init(); err != nil {
		return nil, fmt.Errorf("initializing daemon directories: %w", err)
	}

	var (
		secretsPath = filepath.Join(d.opts.EnvVars.StateDirPath, "secrets")
		err         error
	)

	if d.secretsStore, err = secrets.NewFSStore(secretsPath); err != nil {
		return nil, fmt.Errorf(
			"initializing secrets store at %q: %w", secretsPath, err,
		)
	}

	var currBootstrap bootstrap.Bootstrap
	err = jsonutil.LoadFile(&currBootstrap, bootstrapFilePath)
	if errors.Is(err, fs.ErrNotExist) {
		// daemon has never had a network created or joined
	} else if err != nil {
		return nil, fmt.Errorf(
			"loading bootstrap from %q: %w", bootstrapFilePath, err,
		)
	} else if err := d.initialize(ctx, currBootstrap); err != nil {
		return nil, fmt.Errorf("initializing with bootstrap: %w", err)
	}

	return d, nil
}

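// Constructing and tearing down a daemon follows the usual Go lifecycle
// pattern (a minimal sketch; the logger, Config value, and bin directory path
// are assumed to have been obtained elsewhere):
//
//	d, err := NewDaemon(ctx, logger, daemonConfig, envBinDirPath, &Opts{
//		Stdout: os.Stdout,
//		Stderr: os.Stderr,
//	})
//	if err != nil {
//		// handle error
//	}
//	defer d.Shutdown()
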
// initialize must be called with d.l write lock held.
func (d *daemon) initialize(
	ctx context.Context, currBootstrap bootstrap.Bootstrap,
) error {
	// Update this host's data using whatever configuration has been provided
	// in the daemon config. This way the daemon operates on the most
	// up-to-date bootstrap possible. The updated bootstrap will later get
	// written to garage as a background daemon task, so other hosts will see
	// it as well.
	currBootstrap, err := coalesceDaemonConfigAndBootstrap(
		d.daemonConfig, currBootstrap,
	)
	if err != nil {
		return fmt.Errorf("combining daemon configuration into bootstrap: %w", err)
	}

	err = writeBootstrapToStateDir(d.opts.EnvVars.StateDirPath, currBootstrap)
	if err != nil {
		return fmt.Errorf("writing bootstrap to state dir: %w", err)
	}

	d.currBootstrap = currBootstrap
	d.state = daemonStateInitializing

	d.logger.Info(ctx, "Creating child processes")
	d.children, err = NewChildren(
		ctx,
		d.logger.WithNamespace("children"),
		d.envBinDirPath,
		d.secretsStore,
		d.daemonConfig,
		d.garageAdminToken,
		currBootstrap,
		d.opts,
	)
	if err != nil {
		return fmt.Errorf("creating child processes: %w", err)
	}
	d.logger.Info(ctx, "Child processes created")

	if err := d.postInit(ctx); err != nil {
		d.logger.Error(ctx, "Post-initialization failed, stopping child processes", err)
		d.children.Shutdown()
		return fmt.Errorf("performing post-initialization: %w", err)
	}

	d.state = daemonStateOk

	ctx, cancel := context.WithCancel(context.Background())

	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		<-d.shutdownCh
		cancel()
	}()

	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		d.reloadLoop(ctx)
		d.logger.Debug(ctx, "Daemon restart loop stopped")
	}()

	return nil
}

func withCurrBootstrap[Res any](
	d *daemon, fn func(bootstrap.Bootstrap) (Res, error),
) (Res, error) {
	var zero Res

	d.l.RLock()
	defer d.l.RUnlock()

	currBootstrap, state := d.currBootstrap, d.state

	switch state {
	case daemonStateNoNetwork:
		return zero, ErrNoNetwork
	case daemonStateInitializing:
		return zero, ErrInitializing
	case daemonStateOk:
		return fn(currBootstrap)
	case daemonStateShutdown:
		return zero, errors.New("already shutdown")
	default:
		panic(fmt.Sprintf("unknown state %d", state))
	}
}

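// Most read-only Daemon methods are thin wrappers around withCurrBootstrap,
// which centralizes the state check and locking. A hypothetical new method
// following the same pattern would look like (a sketch; hostCount is not a
// real method of this package):
//
//	func (d *daemon) hostCount(ctx context.Context) (int, error) {
//		return withCurrBootstrap(d, func(b bootstrap.Bootstrap) (int, error) {
//			return len(b.Hosts), nil
//		})
//	}
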
// reload will check the existing hosts data from currBootstrap against a
// potentially updated set of hosts data, and if there are any differences
// will perform whatever changes are necessary.
func (d *daemon) reload(
	ctx context.Context,
	currBootstrap bootstrap.Bootstrap,
	newHosts map[nebula.HostName]bootstrap.Host,
) error {
	var (
		newBootstrap = currBootstrap
		thisHost     = currBootstrap.ThisHost()
	)

	newBootstrap.Hosts = newHosts

	// the daemon's view of this host's bootstrap info takes precedence over
	// whatever is in garage
	newBootstrap.Hosts[thisHost.Name] = thisHost

	diff, err := calcBootstrapDiff(d.daemonConfig, currBootstrap, newBootstrap)
	if err != nil {
		return fmt.Errorf("calculating diff between bootstraps: %w", err)
	} else if diff == (bootstrapDiff{}) {
		d.logger.Info(ctx, "No changes to bootstrap detected")
		return nil
	}

	d.logger.Info(ctx, "Bootstrap has changed, storing new bootstrap")
	d.l.Lock()
	d.currBootstrap = newBootstrap
	d.l.Unlock()

	var errs []error

	// TODO each of these changed cases should block until its respective
	// service is confirmed to have come back online.

	// TODO it's possible that reload could be called concurrently, and one
	// call would override the reloading done by the other.

	if diff.dnsChanged {
		d.logger.Info(ctx, "Restarting dnsmasq to account for bootstrap changes")
		if err := d.children.RestartDNSMasq(newBootstrap); err != nil {
			errs = append(errs, fmt.Errorf("restarting dnsmasq: %w", err))
		}
	}

	if diff.nebulaChanged {
		d.logger.Info(ctx, "Restarting nebula to account for bootstrap changes")
		if err := d.children.RestartNebula(newBootstrap); err != nil {
			errs = append(errs, fmt.Errorf("restarting nebula: %w", err))
		}
	}

	return errors.Join(errs...)
}

func (d *daemon) postInit(ctx context.Context) error {
	if len(d.daemonConfig.Storage.Allocations) > 0 {
		d.logger.Info(ctx, "Applying garage layout")
		if err := garageApplyLayout(
			ctx,
			d.logger,
			d.daemonConfig,
			d.garageAdminToken,
			d.currBootstrap,
		); err != nil {
			return fmt.Errorf("applying garage layout: %w", err)
		}
	}

	// This is only necessary during network creation, otherwise the bootstrap
	// should already have these credentials built in.
	//
	// TODO this is pretty hacky, but there doesn't seem to be a better way to
	// manage it at the moment.
	_, err := getGarageS3APIGlobalBucketCredentials(ctx, d.secretsStore)
	if errors.Is(err, secrets.ErrNotFound) {
		d.logger.Info(ctx, "Initializing garage shared global bucket")
		garageGlobalBucketCreds, err := garageInitializeGlobalBucket(
			ctx,
			d.logger,
			d.daemonConfig,
			d.garageAdminToken,
			d.currBootstrap,
		)
		if err != nil {
			return fmt.Errorf("initializing global bucket: %w", err)
		}

		err = setGarageS3APIGlobalBucketCredentials(
			ctx, d.secretsStore, garageGlobalBucketCreds,
		)
		if err != nil {
			return fmt.Errorf("storing global bucket creds: %w", err)
		}
	} else if err != nil {
		return fmt.Errorf("checking for global bucket creds: %w", err)
	}

	d.logger.Info(ctx, "Updating host info in garage")
	err = d.putGarageBoostrapHost(ctx, d.currBootstrap)
	if err != nil {
		return fmt.Errorf("updating host info in garage: %w", err)
	}

	return nil
}

func (d *daemon) reloadLoop(ctx context.Context) {
	ticker := time.NewTicker(3 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return

		case <-ticker.C:
			d.l.RLock()
			currBootstrap := d.currBootstrap
			d.l.RUnlock()

			d.logger.Info(ctx, "Checking for bootstrap changes")
			newHosts, err := d.getGarageBootstrapHosts(ctx, currBootstrap)
			if err != nil {
				d.logger.Error(ctx, "Failed to get hosts from garage", err)
				continue
			}

			// TODO there's some potential race conditions here, where
			// CreateHost could be called at this point, write the new host to
			// garage and the bootstrap, but then this reload call removes the
			// host from this bootstrap/children until the next reload.

			if err := d.reload(ctx, currBootstrap, newHosts); err != nil {
				d.logger.Error(ctx, "Reloading with new host data failed", err)
				continue
			}
		}
	}
}

func (d *daemon) CreateNetwork(
	ctx context.Context,
	name, domain string,
	ipNet nebula.IPNet,
	hostName nebula.HostName,
) error {
	nebulaCACreds, err := nebula.NewCACredentials(domain, ipNet)
	if err != nil {
		return fmt.Errorf("creating nebula CA cert: %w", err)
	}

	var (
		creationParams = bootstrap.CreationParams{
			ID:     randStr(32),
			Name:   name,
			Domain: domain,
		}
		garageRPCSecret = randStr(32)
	)

	err = setGarageRPCSecret(ctx, d.secretsStore, garageRPCSecret)
	if err != nil {
		return fmt.Errorf("setting garage RPC secret: %w", err)
	}

	err = setNebulaCASigningPrivateKey(ctx, d.secretsStore, nebulaCACreds.SigningPrivateKey)
	if err != nil {
		return fmt.Errorf("setting nebula CA signing key secret: %w", err)
	}

	hostBootstrap, err := bootstrap.New(
		nebulaCACreds,
		creationParams,
		map[nebula.HostName]bootstrap.Host{},
		hostName,
		ipNet.FirstAddr(),
	)
	if err != nil {
		return fmt.Errorf("initializing bootstrap data: %w", err)
	}

	d.l.Lock()
	defer d.l.Unlock()

	if d.state != daemonStateNoNetwork {
		return ErrAlreadyJoined
	}

	if len(d.daemonConfig.Storage.Allocations) < 3 {
		return ErrInvalidConfig.WithData(
			"At least three storage allocations are required.",
		)
	}

	// initialize requires that d.l is held, which it is at this point.
	if err = d.initialize(ctx, hostBootstrap); err != nil {
		return fmt.Errorf("initializing daemon: %w", err)
	}

	return nil
}

func (d *daemon) JoinNetwork(
	ctx context.Context, newBootstrap JoiningBootstrap,
) error {
	d.l.Lock()
	defer d.l.Unlock()

	if d.state != daemonStateNoNetwork {
		return ErrAlreadyJoined
	}

	err := secrets.Import(ctx, d.secretsStore, newBootstrap.Secrets)
	if err != nil {
		return fmt.Errorf("importing secrets: %w", err)
	}

	// initialize requires that d.l is held, which it is at this point.
	if err = d.initialize(ctx, newBootstrap.Bootstrap); err != nil {
		return fmt.Errorf("initializing daemon: %w", err)
	}

	return nil
}

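// A joining host typically receives its JoiningBootstrap from CreateHost run
// on an existing admin host. Assuming it was serialized to a JSON file (a
// sketch; the file path, and JSON as the transport encoding, are assumptions
// for illustration), loading and applying it looks like:
//
//	var jb JoiningBootstrap
//	if err := jsonutil.LoadFile(&jb, "/path/to/bootstrap.json"); err != nil {
//		// handle error
//	}
//	if err := d.JoinNetwork(ctx, jb); err != nil {
//		// handle error, e.g. ErrAlreadyJoined
//	}
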
func (d *daemon) GetBootstrap(ctx context.Context) (bootstrap.Bootstrap, error) {
	return withCurrBootstrap(d, func(
		currBootstrap bootstrap.Bootstrap,
	) (
		bootstrap.Bootstrap, error,
	) {
		return currBootstrap, nil
	})
}

func (d *daemon) GetGarageClientParams(
	ctx context.Context,
) (
	GarageClientParams, error,
) {
	return withCurrBootstrap(d, func(
		currBootstrap bootstrap.Bootstrap,
	) (
		GarageClientParams, error,
	) {
		return d.getGarageClientParams(ctx, currBootstrap)
	})
}

func (d *daemon) RemoveHost(ctx context.Context, hostName nebula.HostName) error {
	// TODO RemoveHost should publish a certificate revocation for the host
	// being removed.
	_, err := withCurrBootstrap(d, func(
		currBootstrap bootstrap.Bootstrap,
	) (
		struct{}, error,
	) {
		garageClientParams, err := d.getGarageClientParams(ctx, currBootstrap)
		if err != nil {
			return struct{}{}, fmt.Errorf("get garage client params: %w", err)
		}

		client := garageClientParams.GlobalBucketS3APIClient()
		return struct{}{}, removeGarageBootstrapHost(ctx, client, hostName)
	})
	return err
}

func makeCACreds(
	currBootstrap bootstrap.Bootstrap,
	caSigningPrivateKey nebula.SigningPrivateKey,
) nebula.CACredentials {
	return nebula.CACredentials{
		Public:            currBootstrap.CAPublicCredentials,
		SigningPrivateKey: caSigningPrivateKey,
	}
}

func chooseAvailableIP(b bootstrap.Bootstrap) (netip.Addr, error) {
	var (
		cidrIPNet = b.CAPublicCredentials.Cert.Unwrap().Details.Subnets[0]
		cidrMask  = cidrIPNet.Mask
		cidrIPB   = cidrIPNet.IP

		cidr           = netip.MustParsePrefix(cidrIPNet.String())
		cidrIP         = cidr.Addr()
		cidrSuffixBits = cidrIP.BitLen() - cidr.Bits()

		inUseIPs = make(map[netip.Addr]struct{}, len(b.Hosts))
	)

	for _, host := range b.Hosts {
		inUseIPs[host.IP()] = struct{}{}
	}

	// First check that there are any available addresses at all. We can
	// determine the number of possible addresses using the network CIDR. The
	// first IP in a subnet is the network identifier, and is reserved. The
	// last IP is the broadcast IP, and is also reserved. Hence, the -2.
	usableIPs := (1 << cidrSuffixBits) - 2
	if len(inUseIPs) >= usableIPs {
		return netip.Addr{}, errors.New("no available IPs")
	}

	// We need to know the subnet broadcast address, so we don't accidentally
	// produce it.
	cidrBCastIPB := bytes.Clone(cidrIPB)
	for i := range cidrBCastIPB {
		cidrBCastIPB[i] |= ^cidrMask[i]
	}
	cidrBCastIP, ok := netip.AddrFromSlice(cidrBCastIPB)
	if !ok {
		panic(fmt.Sprintf("invalid broadcast ip calculated: %x", cidrBCastIPB))
	}

	// Try a handful of times to pick an IP at random. This is preferred, as
	// it leaves less room for two different CreateHost calls to choose the
	// same IP.
	for range 20 {
		b := make([]byte, len(cidrIPB))
		if _, err := rand.Read(b); err != nil {
			return netip.Addr{}, fmt.Errorf("reading random bytes: %w", err)
		}

		for i := range b {
			b[i] = cidrIPB[i] | (b[i] & ^cidrMask[i])
		}

		ip, ok := netip.AddrFromSlice(b)
		if !ok {
			panic(fmt.Sprintf("generated invalid IP: %x", b))
		} else if !cidr.Contains(ip) {
			panic(fmt.Sprintf(
				"generated IP %v which is not in cidr %v", ip, cidr,
			))
		}

		if ip == cidrIP || ip == cidrBCastIP {
			continue
		}

		if _, inUse := inUseIPs[ip]; !inUse {
			return ip, nil
		}
	}

	// If randomly picking fails then just go through IPs one by one until a
	// free one is found.
	for ip := cidrIP.Next(); ip != cidrBCastIP; ip = ip.Next() {
		if _, inUse := inUseIPs[ip]; !inUse {
			return ip, nil
		}
	}

	panic("all IPs are in use, but somehow that wasn't determined earlier")
}

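// To illustrate the masking arithmetic above with concrete numbers (a worked
// example, not code from this package): for the subnet 10.10.0.0/24 the mask
// is ff.ff.ff.00, so for each byte
//
//	b[i] = cidrIPB[i] | (b[i] & ^cidrMask[i])
//
// keeps the first three bytes fixed at 10.10.0 and randomizes only the last
// byte, e.g. random bytes {0x7f, 0x03, 0x22, 0x9c} become 10.10.0.156. The
// candidates 10.10.0.0 (network identifier) and 10.10.0.255 (broadcast) are
// then skipped, leaving 2^8-2 = 254 usable addresses.
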
func (d *daemon) CreateHost(
	ctx context.Context,
	hostName nebula.HostName,
	opts CreateHostOpts,
) (
	JoiningBootstrap, error,
) {
	d.l.RLock()
	currBootstrap := d.currBootstrap
	d.l.RUnlock()

	ip := opts.IP
	if ip == (netip.Addr{}) {
		var err error
		if ip, err = chooseAvailableIP(currBootstrap); err != nil {
			return JoiningBootstrap{}, fmt.Errorf(
				"choosing available IP: %w", err,
			)
		}
	}
	// TODO if the ip is given, check that it's not already in use.

	caSigningPrivateKey, err := getNebulaCASigningPrivateKey(
		ctx, d.secretsStore,
	)
	if err != nil {
		return JoiningBootstrap{}, fmt.Errorf("getting CA signing key: %w", err)
	}

	var joiningBootstrap JoiningBootstrap
	joiningBootstrap.Bootstrap, err = bootstrap.New(
		makeCACreds(currBootstrap, caSigningPrivateKey),
		currBootstrap.NetworkCreationParams,
		currBootstrap.Hosts,
		hostName,
		ip,
	)
	if err != nil {
		return JoiningBootstrap{}, fmt.Errorf(
			"initializing bootstrap data: %w", err,
		)
	}

	secretsIDs := []secrets.ID{
		garageRPCSecretSecretID,
		garageS3APIGlobalBucketCredentialsSecretID,
	}

	if opts.CanCreateHosts {
		secretsIDs = append(secretsIDs, nebulaCASigningPrivateKeySecretID)
	}

	if joiningBootstrap.Secrets, err = secrets.Export(
		ctx, d.secretsStore, secretsIDs,
	); err != nil {
		return JoiningBootstrap{}, fmt.Errorf("exporting secrets: %w", err)
	}

	d.logger.Info(ctx, "Putting new host in garage")
	err = d.putGarageBoostrapHost(ctx, joiningBootstrap.Bootstrap)
	if err != nil {
		return JoiningBootstrap{}, fmt.Errorf("putting new host in garage: %w", err)
	}

	// the new bootstrap will have been initialized with all existing hosts
	// (based on currBootstrap) as well as the host being created.
	newHosts := joiningBootstrap.Bootstrap.Hosts

	d.logger.Info(ctx, "Reloading local state with new host")
	if err := d.reload(ctx, currBootstrap, newHosts); err != nil {
		return JoiningBootstrap{}, fmt.Errorf("reloading child processes: %w", err)
	}

	return joiningBootstrap, nil
}

func (d *daemon) CreateNebulaCertificate(
	ctx context.Context,
	hostName nebula.HostName,
	hostPubKey nebula.EncryptingPublicKey,
) (
	nebula.Certificate, error,
) {
	return withCurrBootstrap(d, func(
		currBootstrap bootstrap.Bootstrap,
	) (
		nebula.Certificate, error,
	) {
		host, ok := currBootstrap.Hosts[hostName]
		if !ok {
			return nebula.Certificate{}, ErrHostNotFound
		}
		ip := host.IP()

		caSigningPrivateKey, err := getNebulaCASigningPrivateKey(
			ctx, d.secretsStore,
		)
		if err != nil {
			return nebula.Certificate{}, fmt.Errorf("getting CA signing key: %w", err)
		}

		caCreds := makeCACreds(currBootstrap, caSigningPrivateKey)

		return nebula.NewHostCert(caCreds, hostPubKey, hostName, ip)
	})
}

func (d *daemon) Shutdown() error {
	d.l.Lock()
	defer d.l.Unlock()

	close(d.shutdownCh)
	d.wg.Wait()

	d.state = daemonStateShutdown

	if d.children != nil {
		d.children.Shutdown()
	}

	return nil
}

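// Provisioning a new host from an existing admin host ties several of the
// above methods together (a minimal sketch; the host name and the delivery
// mechanism for the resulting JoiningBootstrap are assumptions — in practice
// it is serialized, e.g. as JSON, and transferred to the new host
// out-of-band):
//
//	jb, err := d.CreateHost(ctx, nebula.HostName("new-host"), CreateHostOpts{
//		CanCreateHosts: false, // the new host gets no admin privileges
//	})
//	if err != nil {
//		// handle error
//	}
//	// deliver jb to the new host, which calls JoinNetwork(ctx, jb)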