From 92802da2ddf97e305fb310dc71a30ccd55a7c0e9 Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Tue, 7 Jan 2025 15:40:50 +0100 Subject: [PATCH] Use GLM for garage layout management --- go/bootstrap/hosts.go | 10 +- go/daemon/children/children.go | 101 ++++- go/daemon/children/garage.go | 24 +- go/daemon/daecommon/config.go | 16 - go/daemon/network/bootstrap.go | 34 +- go/daemon/network/garage.go | 67 ++-- go/daemon/network/network.go | 353 +++++++++++++----- go/daemon/network/network_it_test.go | 259 ++++++------- go/daemon/network/network_it_util_test.go | 85 +++-- go/garage/admin_client.go | 1 + go/garage/garagesrv/garagesrv.go | 136 +++---- go/garage/peer.go | 34 +- .../validate-non-conflicting-ports-or-dirs.md | 10 + tasks/misc/storage-allocation-status.md | 10 + 14 files changed, 670 insertions(+), 470 deletions(-) create mode 100644 tasks/bugs/validate-non-conflicting-ports-or-dirs.md create mode 100644 tasks/misc/storage-allocation-status.md diff --git a/go/bootstrap/hosts.go b/go/bootstrap/hosts.go index f0ded15..af6b3a1 100644 --- a/go/bootstrap/hosts.go +++ b/go/bootstrap/hosts.go @@ -124,10 +124,12 @@ func (h Host) GarageNodes() []garage.RemoteNode { var nodes []garage.RemoteNode for _, instance := range h.Garage.Instances { nodes = append(nodes, garage.RemoteNode{ - ID: instance.ID, - IP: h.IP().String(), - RPCPort: instance.RPCPort, - S3APIPort: instance.S3APIPort, + Node: garage.Node{ + IP: h.IP().String(), + RPCPort: instance.RPCPort, + S3APIPort: instance.S3APIPort, + }, + ID: instance.ID, }) } return nodes diff --git a/go/daemon/children/children.go b/go/daemon/children/children.go index f782bad..36d716e 100644 --- a/go/daemon/children/children.go +++ b/go/daemon/children/children.go @@ -17,6 +17,7 @@ import ( "isle/bootstrap" "isle/daemon/daecommon" "isle/garage" + "isle/garage/garagesrv" "isle/secrets" "isle/toolkit" ) @@ -33,12 +34,28 @@ type Opts struct { // GarageNewCluster should be true if the garage instances being started // are the first instances in a cluster which is being created. GarageNewCluster bool + + // GarageBootstrapPeers will be used as the set of peers each garage + // instance should use to find the rest of the garage cluster. + // + // Defaults to peer information contained in the bootstrap hosts. + GarageBootstrapPeers []garage.RemoteNode + + // DEPRECATED can be used to manually set the db engine used by garage for + // new allocations. If not given then garagesrv.DBEngineSqlite will be used + // for new allocations. 
+ GarageDefaultDBEngine garagesrv.DBEngine } func (o *Opts) withDefaults() *Opts { if o == nil { o = new(Opts) } + + if o.GarageDefaultDBEngine == "" { + o.GarageDefaultDBEngine = garageDefaultDBEngine + } + return o } @@ -53,6 +70,7 @@ type Children struct { runtimeDir toolkit.Dir garageAdminToken string nebulaDeviceNamer *NebulaDeviceNamer + opts *Opts garageRPCSecret string @@ -91,6 +109,7 @@ func New( runtimeDir: runtimeDir, garageAdminToken: garageAdminToken, nebulaDeviceNamer: nebulaDeviceNamer, + opts: opts, garageRPCSecret: garageRPCSecret, } @@ -125,6 +144,11 @@ func New( return nil, fmt.Errorf("starting dnsmasq: %w", err) } + garageBootstrapPeers := opts.GarageBootstrapPeers + if garageBootstrapPeers == nil { + garageBootstrapPeers = hostBootstrap.GarageNodes() + } + if c.garageProcs, err = garagePmuxProcs( ctx, c.logger, @@ -134,6 +158,8 @@ func New( networkConfig, garageAdminToken, hostBootstrap, + garageBootstrapPeers, + c.opts.GarageDefaultDBEngine, ); err != nil { logger.Warn(ctx, "Failed to start garage processes, shutting down child processes", err) c.Shutdown() @@ -209,24 +235,28 @@ func (c *Children) reloadGarage( networkConfig daecommon.NetworkConfig, hostBootstrap bootstrap.Bootstrap, ) error { - allocs := networkConfig.Storage.Allocations - if len(allocs) == 0 { - return nil - } - thisHost := hostBootstrap.ThisHost() + var ( + allocs = networkConfig.Storage.Allocations + thisHost = hostBootstrap.ThisHost() + anyStarted bool + allocsM = map[daecommon.ConfigStorageAllocation]struct{}{} + ) - var anyCreated bool for _, alloc := range allocs { + allocsM[alloc] = struct{}{} + var ( procName = garagePmuxProcName(alloc) ctx = mctx.Annotate( - ctx, + mctx.WithAnnotator(ctx, alloc), "garageProcName", procName, - "garageDataPath", alloc.DataPath, ) ) + // Rewrite the child config always, even if we don't always restart it. + // If nothing else this will capture any changes to the bootstrap nodes, + // which will be useful if garage gets restarted for any reason. 
childConfigPath, err := garageWriteChildConfig( ctx, c.logger, @@ -234,20 +264,31 @@ func (c *Children) reloadGarage( c.runtimeDir.Path, c.garageAdminToken, hostBootstrap, + hostBootstrap.GarageNodes(), alloc, + garageDefaultDBEngine, ) if err != nil { return fmt.Errorf("writing child config file for alloc %+v: %w", alloc, err) } - if _, ok := c.garageProcs[procName]; ok { - c.logger.Info(ctx, "Garage instance already exists") + if proc, ok := c.garageProcs[procName]; ok { + if proc.alloc == alloc { + c.logger.Info(ctx, "No changes to storage allocation, leaving garage process as-is") + continue + } + + c.logger.Info(ctx, "Storage allocation modified, restarting garage process") + proc.Restart() + anyStarted = true + + proc.alloc = alloc + c.garageProcs[procName] = proc + continue } - anyCreated = true - - c.logger.Info(ctx, "Garage config has been added, creating process") + c.logger.Info(ctx, "New storage allocation, creating garage process") c.garageProcs[procName] = garageProc{ Process: garagePmuxProc( ctx, c.logger, c.binDirPath, procName, childConfigPath, @@ -255,9 +296,26 @@ func (c *Children) reloadGarage( alloc: alloc, adminAddr: garageAllocAdminAddr(thisHost, alloc), } + + anyStarted = true } - if anyCreated { + for procName, proc := range c.garageProcs { + if _, ok := allocsM[proc.alloc]; ok { + continue + } + + ctx := mctx.Annotate( + mctx.WithAnnotator(ctx, proc.alloc), + "garageProcName", procName, + ) + + c.logger.Info(ctx, "Storage allocation removed, stopping garage process") + proc.Stop() + delete(c.garageProcs, procName) + } + + if anyStarted { if err := waitForGarage( ctx, c.logger, @@ -296,6 +354,21 @@ func (c *Children) Reload( return errors.Join(errs...) } +// ActiveStorageAllocations returns the storage allocations which currently have +// active garage instances. +func (c *Children) ActiveStorageAllocations() []daecommon.ConfigStorageAllocation { + allocs := make([]daecommon.ConfigStorageAllocation, 0, len(c.garageProcs)) + for _, proc := range c.garageProcs { + allocs = append(allocs, proc.alloc) + } + + slices.SortFunc(allocs, func(a, b daecommon.ConfigStorageAllocation) int { + return cmp.Compare(a.RPCPort, b.RPCPort) + }) + + return allocs +} + // GarageAdminClient returns an admin client for an active local garage process, // or false if there are no garage processes. 
func (c *Children) GarageAdminClient() (*garage.AdminClient, bool) { diff --git a/go/daemon/children/garage.go b/go/daemon/children/garage.go index fcc00b0..4e3286a 100644 --- a/go/daemon/children/garage.go +++ b/go/daemon/children/garage.go @@ -17,6 +17,10 @@ import ( "dev.mediocregopher.com/mediocre-go-lib.git/mlog" ) +const ( + garageDefaultDBEngine = garagesrv.DBEngineSqlite +) + func garageAdminClientLogger(logger *mlog.Logger) *mlog.Logger { return logger.WithNamespace("garageAdminClient") } @@ -42,9 +46,8 @@ func waitForGarage( ) ctx := mctx.Annotate( - ctx, + mctx.WithAnnotator(ctx, proc.alloc), "garageAdminAddr", proc.adminAddr, - "garageDataPath", proc.alloc.DataPath, ) logger.Info(ctx, "Waiting for garage instance to be healthy") @@ -70,17 +73,16 @@ func garageWriteChildConfig( logger *mlog.Logger, rpcSecret, runtimeDirPath, adminToken string, hostBootstrap bootstrap.Bootstrap, + bootstrapPeers []garage.RemoteNode, alloc daecommon.ConfigStorageAllocation, + defaultDBEngine garagesrv.DBEngine, ) ( string, error, ) { var ( thisHost = hostBootstrap.ThisHost() - id = daecommon.BootstrapGarageHostForAlloc(thisHost, alloc).ID - - node = garage.LocalNode{ - RemoteNode: garage.RemoteNode{ - ID: id, + node = garage.LocalNode{ + Node: garage.Node{ IP: thisHost.IP().String(), RPCPort: alloc.RPCPort, S3APIPort: alloc.S3APIPort, @@ -93,7 +95,7 @@ func garageWriteChildConfig( ) ) - dbEngine, err := garagesrv.GetDBEngine(alloc.MetaPath) + dbEngine, err := garagesrv.GetDBEngine(alloc.MetaPath, defaultDBEngine) if err != nil { return "", fmt.Errorf("getting alloc db engine: %w", err) } @@ -111,7 +113,7 @@ func garageWriteChildConfig( DBEngine: dbEngine, LocalNode: node, - BootstrapPeers: hostBootstrap.GarageNodes(), + BootstrapPeers: bootstrapPeers, }, ) @@ -152,6 +154,8 @@ func garagePmuxProcs( networkConfig daecommon.NetworkConfig, adminToken string, hostBootstrap bootstrap.Bootstrap, + bootstrapPeers []garage.RemoteNode, + defaultDBEngine garagesrv.DBEngine, ) ( map[string]garageProc, error, ) { @@ -171,7 +175,9 @@ func garagePmuxProcs( logger, rpcSecret, runtimeDirPath, adminToken, hostBootstrap, + bootstrapPeers, alloc, + defaultDBEngine, ) if err != nil { return nil, fmt.Errorf("writing child config file for alloc %+v: %w", alloc, err) diff --git a/go/daemon/daecommon/config.go b/go/daemon/daecommon/config.go index 7ded765..c5900e0 100644 --- a/go/daemon/daecommon/config.go +++ b/go/daemon/daecommon/config.go @@ -5,7 +5,6 @@ import ( "encoding/json" "fmt" "io" - "isle/bootstrap" "isle/toolkit" "isle/yamlutil" "net" @@ -325,18 +324,3 @@ func LoadConfig(userConfigPath string) (Config, error) { err := yamlutil.LoadYamlFile(&config, userConfigPath) return config, err } - -// BootstrapGarageHostForAlloc returns the bootstrap.GarageHostInstance which -// corresponds with the given alloc from the daemon config. This will panic if -// no associated instance can be found. 
-func BootstrapGarageHostForAlloc( - host bootstrap.Host, alloc ConfigStorageAllocation, -) bootstrap.GarageHostInstance { - for _, inst := range host.Garage.Instances { - if inst.RPCPort == alloc.RPCPort { - return inst - } - } - - panic(fmt.Sprintf("could not find alloc %+v in the bootstrap data", alloc)) -} diff --git a/go/daemon/network/bootstrap.go b/go/daemon/network/bootstrap.go index 9b51eb1..02f983a 100644 --- a/go/daemon/network/bootstrap.go +++ b/go/daemon/network/bootstrap.go @@ -63,28 +63,22 @@ func applyNetworkConfigToBootstrap( }, } - if allocs := networkConfig.Storage.Allocations; len(allocs) > 0 { - - for i, alloc := range allocs { - - id, rpcPort, err := garagesrv.InitAlloc(alloc.MetaPath, alloc.RPCPort) - if err != nil { - return bootstrap.Bootstrap{}, fmt.Errorf( - "initializing alloc at %q: %w", alloc.MetaPath, err, - ) - } - - host.Garage.Instances = append( - host.Garage.Instances, - bootstrap.GarageHostInstance{ - ID: id, - RPCPort: rpcPort, - S3APIPort: alloc.S3APIPort, - }, + for _, alloc := range networkConfig.Storage.Allocations { + id, err := garagesrv.LoadAllocID(alloc.MetaPath) + if err != nil { + return bootstrap.Bootstrap{}, fmt.Errorf( + "getting ID of alloc at %q: %w", alloc.MetaPath, err, ) - - allocs[i].RPCPort = rpcPort } + + host.Garage.Instances = append( + host.Garage.Instances, + bootstrap.GarageHostInstance{ + ID: id, + RPCPort: alloc.RPCPort, + S3APIPort: alloc.S3APIPort, + }, + ) } hostBootstrap.Hosts = maps.Clone(hostBootstrap.Hosts) diff --git a/go/daemon/network/garage.go b/go/daemon/network/garage.go index 220c8f6..2241680 100644 --- a/go/daemon/network/garage.go +++ b/go/daemon/network/garage.go @@ -9,9 +9,11 @@ import ( "isle/bootstrap" "isle/daemon/daecommon" "isle/garage" + "isle/garage/garagesrv" "isle/nebula" "isle/secrets" "isle/toolkit" + "net/netip" "path/filepath" "time" @@ -51,27 +53,47 @@ func getGarageClientParams( }, nil } -func garageApplyLayout( - ctx context.Context, - logger *mlog.Logger, - networkConfig daecommon.NetworkConfig, - adminClient *garage.AdminClient, - prevHost, currHost bootstrap.Host, -) error { - var ( - hostName = currHost.Name - allocs = networkConfig.Storage.Allocations - roles = make([]garage.Role, len(allocs)) - roleIDs = map[string]struct{}{} +func garageInitAllocs( + hostIP netip.Addr, + allocs []daecommon.ConfigStorageAllocation, +) ( + []garage.RemoteNode, error, +) { + peers := make([]garage.RemoteNode, len(allocs)) + for i, alloc := range allocs { + id, err := garagesrv.InitAlloc(alloc.MetaPath) + if err != nil { + return nil, fmt.Errorf("initializing alloc %+v: %w", alloc, err) + } - idsToRemove = make([]string, 0, len(prevHost.Garage.Instances)) + peers[i] = garage.RemoteNode{ + Node: garage.Node{ + IP: hostIP.String(), + RPCPort: alloc.RPCPort, + S3APIPort: alloc.S3APIPort, + }, + ID: id, + } + } + + return peers, nil +} + +func garageAllocsToRoles( + host bootstrap.Host, allocs []daecommon.ConfigStorageAllocation, +) ( + []garage.Role, error, +) { + var ( + hostName = host.Name + roles = make([]garage.Role, len(allocs)) ) - defer adminClient.Close() - for i, alloc := range allocs { - id := daecommon.BootstrapGarageHostForAlloc(currHost, alloc).ID - roleIDs[id] = struct{}{} + id, err := garagesrv.LoadAllocID(alloc.MetaPath) + if err != nil { + return nil, fmt.Errorf("getting ID of alloc %+v: %w", alloc, err) + } roles[i] = garage.Role{ ID: id, @@ -81,21 +103,12 @@ func garageApplyLayout( } } - for _, prevInst := range prevHost.Garage.Instances { - if _, ok := roleIDs[prevInst.ID]; !ok { - 
idsToRemove = append(idsToRemove, prevInst.ID) - } - } - - return adminClient.ApplyLayout(ctx, roles, idsToRemove) + return roles, nil } func garageInitializeGlobalBucket( ctx context.Context, - logger *mlog.Logger, - networkConfig daecommon.NetworkConfig, adminClient *garage.AdminClient, - host bootstrap.Host, ) ( garage.S3APICredentials, error, ) { diff --git a/go/daemon/network/network.go b/go/daemon/network/network.go index 1114463..57c51a5 100644 --- a/go/daemon/network/network.go +++ b/go/daemon/network/network.go @@ -14,7 +14,9 @@ import ( "isle/bootstrap" "isle/daemon/children" "isle/daemon/daecommon" + "isle/daemon/network/glm" "isle/garage" + "isle/garage/garagesrv" "isle/jsonutil" "isle/nebula" "isle/secrets" @@ -163,6 +165,9 @@ type Opts struct { // testBlocker is used by tests to set blockpoints. testBlocker *toolkit.TestBlocker + + // DEPRECATED See corresponding field in [children.Opts] + garageDefaultDBEngine garagesrv.DBEngine } func (o *Opts) withDefaults() *Opts { @@ -187,7 +192,8 @@ type network struct { opts *Opts - secretsStore secrets.Store + secretsStore secrets.Store + garageLayoutMgr glm.GarageLayoutManager l sync.RWMutex children *children.Children @@ -218,13 +224,15 @@ func newNetwork( ctx, cancel := context.WithCancel(context.WithoutCancel(ctx)) var ( - n = &network{ + ip = currBootstrap.ThisHost().IP() + n = &network{ logger: logger, envBinDirPath: envBinDirPath, nebulaDeviceNamer: nebulaDeviceNamer, stateDir: stateDir, runtimeDir: runtimeDir, opts: opts.withDefaults(), + garageLayoutMgr: glm.NewGarageLayoutManager(stateDir, ip), currBootstrap: currBootstrap, workerCtx: ctx, workerCancel: cancel, @@ -237,6 +245,12 @@ func newNetwork( return nil, fmt.Errorf("resolving network config: %w", err) } + if err := n.garageLayoutMgr.Validate( + ctx, n.networkConfig.Storage.Allocations, + ); err != nil { + return nil, ErrInvalidConfig.WithData(err.Error()) + } + secretsDir, err := n.stateDir.MkChildDir("secrets", dirsMayExist) if err != nil { return nil, fmt.Errorf("creating secrets dir: %w", err) @@ -399,6 +413,11 @@ func (constructorsImpl) create( ) } + err = n.garageLayoutMgr.Validate(ctx, n.networkConfig.Storage.Allocations) + if err != nil { + return nil, ErrInvalidConfig.WithData(err.Error()) + } + err = daecommon.SetGarageRPCSecret(ctx, n.secretsStore, garageRPCSecret) if err != nil { return nil, fmt.Errorf("setting garage RPC secret: %w", err) @@ -411,6 +430,12 @@ func (constructorsImpl) create( return nil, fmt.Errorf("setting nebula CA signing key secret: %w", err) } + if err = n.garageLayoutMgr.SetActiveAllocations( + ctx, n.networkConfig.Storage.Allocations, + ); err != nil { + return nil, fmt.Errorf("initializing GLM active allocations: %w", err) + } + if err := n.initialize(ctx, true); err != nil { return nil, fmt.Errorf("initializing with bootstrap: %w", err) } @@ -418,10 +443,12 @@ func (constructorsImpl) create( return n, nil } -// preChildrenInit performs steps which are required prior to children being -// initializes/reloaded. The lock must be held when this is called (if not being -// called as part of initialize). -func (n *network) preChildrenInit(ctx context.Context) error { +// updateBootstrapUnsafe updates both the locally saved bootstrap as well as +// this host's bootstrap host info in garage, first applying the network config +// to the bootstrap host info. +// +// Must be called with the lock held. 
+func (n *network) updateBootstrapUnsafe(ctx context.Context) error { var err error if n.currBootstrap, err = applyNetworkConfigToBootstrap( n.networkConfig, n.currBootstrap, @@ -436,61 +463,22 @@ func (n *network) preChildrenInit(ctx context.Context) error { return fmt.Errorf("writing bootstrap to state dir: %w", err) } + n.logger.Info(ctx, "Updating host info in garage") + if err := putGarageBoostrapHost( + ctx, n.secretsStore, n.currBootstrap, + ); err != nil { + return fmt.Errorf("updating host info in garage: %w", err) + } + return nil } -// postChildrenInit performs steps which are required after children have been -// initialized/reloaded. The lock must be held when this is called (if not being -// called as part of initialize). -func (n *network) postChildrenInit( - ctx context.Context, - prevThisHost bootstrap.Host, - createGarageGlobalBucket bool, +// see comment on garageWaitForAlloc +func (n *network) garageWaitForAllocs( + ctx context.Context, allocs []daecommon.ConfigStorageAllocation, ) error { - var ( - thisHost = n.currBootstrap.ThisHost() - garageAdminClient, hasGarage = n.children.GarageAdminClient() - ) - - if hasGarage { - defer garageAdminClient.Close() - } - - if hasGarage { - n.logger.Info(ctx, "Applying garage layout") - if err := garageApplyLayout( - ctx, - n.logger, - n.networkConfig, - garageAdminClient, - prevThisHost, thisHost, - ); err != nil { - return fmt.Errorf("applying garage layout: %w", err) - } - } - - if createGarageGlobalBucket { - n.logger.Info(ctx, "Initializing garage shared global bucket") - garageGlobalBucketCreds, err := garageInitializeGlobalBucket( - ctx, - n.logger, - n.networkConfig, - garageAdminClient, - thisHost, - ) - if err != nil { - return fmt.Errorf("initializing global bucket: %w", err) - } - - err = daecommon.SetGarageS3APIGlobalBucketCredentials( - ctx, n.secretsStore, garageGlobalBucketCreds, - ) - if err != nil { - return fmt.Errorf("storing global bucket creds: %w", err) - } - } - - for _, alloc := range n.networkConfig.Storage.Allocations { + var errs []error + for _, alloc := range allocs { garageAdminClient, ok := n.children.GarageAdminClientForAlloc(alloc) if !ok { return fmt.Errorf("no garage instance created for %+v", alloc) @@ -502,21 +490,99 @@ func (n *network) postChildrenInit( if err := garageWaitForAlloc( ctx, n.logger, garageAdminClient, ); err != nil { - return fmt.Errorf( + errs = append(errs, fmt.Errorf( "waiting for alloc %+v to initialize: %w", alloc, err, - ) + )) } } - n.logger.Info(ctx, "Updating host info in garage") - err := putGarageBoostrapHost(ctx, n.secretsStore, n.currBootstrap) + return errors.Join(errs...) 
+} + +// must hold lock to call this +func (n *network) glmStateTransitionUnsafe(ctx context.Context) error { + var knownNodes []garage.KnownNode + + if adminClient, ok := n.children.GarageAdminClient(); ok { + defer adminClient.Close() + + n.logger.Info(ctx, "Getting garage cluster status") + status, err := adminClient.Status(ctx) + if err != nil { + return fmt.Errorf("getting garage cluster state: %w", err) + } + + knownNodes = status.Nodes + } + + n.logger.Info(ctx, "Calculating garage layout state transition") + stateTx, err := n.garageLayoutMgr.CalculateStateTransition( + ctx, knownNodes, n.networkConfig.Storage.Allocations, + ) if err != nil { - return fmt.Errorf("updating host info in garage: %w", err) + return fmt.Errorf("getting next state tx: %w", err) + } + + childrenNetworkConfig := n.networkConfig + childrenNetworkConfig.Storage.Allocations = stateTx.ActiveAllocations() + + n.logger.Info(ctx, "Reloading children with updated storage allocations") + err = n.children.Reload(ctx, childrenNetworkConfig, n.currBootstrap) + if err != nil { + return fmt.Errorf("reloading children: %w", err) + } + + if adminClient, ok := n.children.GarageAdminClient(); ok { + defer adminClient.Close() + + var ( + host = n.currBootstrap.ThisHost() + // From garage's perspective a node is "removed" from the cluster by + // removing its role, which puts it into the draining state. + removeIDs = stateTx.DrainAllocationIDs() + ) + + addModifyRoles, err := garageAllocsToRoles( + host, stateTx.AddModifyAllocations, + ) + if err != nil { + return fmt.Errorf("converting allocs to roles: %w", err) + } + + n.logger.Info(ctx, "Applying state transition to garage layout") + if err := adminClient.ApplyLayout( + ctx, addModifyRoles, removeIDs, + ); err != nil { + return fmt.Errorf("applying state tx to layout: %w", err) + } + } else { + n.logger.Info(ctx, "No garage instances running, no layout changes to make") + } + + if err := n.garageWaitForAllocs( + ctx, n.networkConfig.Storage.Allocations, + ); err != nil { + return fmt.Errorf( + "waiting for garage allocations to fully initialize: %w", err, + ) + } + + n.logger.Info(ctx, "Committing state transition") + if err = n.garageLayoutMgr.CommitStateTransition( + ctx, stateTx, + ); err != nil { + return fmt.Errorf("commiting state tx: %w", err) } return nil } +func (n *network) glmStateTransition(ctx context.Context) error { + n.l.Lock() + defer n.l.Unlock() + return n.glmStateTransitionUnsafe(ctx) +} + func (n *network) reloadHosts(ctx context.Context) error { n.l.RLock() currBootstrap := n.currBootstrap @@ -541,8 +607,13 @@ func (n *network) reloadHosts(ctx context.Context) error { return fmt.Errorf("writing bootstrap to state dir: %w", err) } + childrenNetworkConfig, err := n.getChildrenNetworkConfig(ctx) + if err != nil { + return fmt.Errorf("getting network config for children: %w", err) + } + n.logger.Info(ctx, "Reloading child processes") - err = n.children.Reload(ctx, n.networkConfig, n.currBootstrap) + err = n.children.Reload(ctx, childrenNetworkConfig, n.currBootstrap) if err != nil { return fmt.Errorf( "reloading child processes: %w", err, @@ -566,8 +637,8 @@ func (n *network) periodically( ticker := time.NewTicker(period) defer ticker.Stop() - n.logger.Info(ctx, "Starting background job runner") - defer n.logger.Info(ctx, "Stopping background job runner") + n.logger.Info(ctx, "Starting background job worker") + defer n.logger.Info(ctx, "Stopping background job worker") for { select { @@ -575,7 +646,7 @@ func (n *network) periodically( return case 
<-ticker.C: - n.logger.Info(ctx, "Background job running") + n.logger.Info(ctx, "Background job worker") if err := fn(ctx); err != nil { n.logger.Error(ctx, "Background job failed", err) } @@ -584,14 +655,102 @@ func (n *network) periodically( }() } -func (n *network) initialize(ctx context.Context, isCreate bool) error { - var ( - prevThisHost = n.currBootstrap.ThisHost() - err error - ) +func (n *network) getChildrenNetworkConfig( + ctx context.Context, +) ( + daecommon.NetworkConfig, error, +) { + childrenNetworkConfig := n.networkConfig - if err := n.preChildrenInit(ctx); err != nil { - return fmt.Errorf("performing pre-initialization: %w", err) + activeStorageAllocs, err := n.garageLayoutMgr.GetActiveAllocations(ctx) + if err != nil { + return daecommon.NetworkConfig{}, fmt.Errorf( + "getting active storage allocations: %w", err, + ) + } + childrenNetworkConfig.Storage.Allocations = activeStorageAllocs + + return childrenNetworkConfig, nil +} + +func (n *network) initializePostChildren( + ctx context.Context, isCreate bool, +) error { + if !isCreate { + n.logger.Info(ctx, "Making any necessary changes to garage layout") + if err := n.glmStateTransitionUnsafe(ctx); err != nil { + return fmt.Errorf("performing garage layout transition: %w", err) + } + } else { + roles, err := garageAllocsToRoles( + n.currBootstrap.ThisHost(), n.networkConfig.Storage.Allocations, + ) + if err != nil { + return fmt.Errorf("converting allocs to roles: %w", err) + } + + garageAdminClient, _ := n.children.GarageAdminClient() + defer garageAdminClient.Close() + + n.logger.Info(ctx, "Applying initial garage layout") + if err := garageAdminClient.ApplyLayout(ctx, roles, nil); err != nil { + return fmt.Errorf("applying initial garage layout: %w", err) + } + + n.logger.Info(ctx, "Initializing garage shared global bucket") + garageGlobalBucketCreds, err := garageInitializeGlobalBucket( + ctx, garageAdminClient, + ) + if err != nil { + return fmt.Errorf("initializing global bucket: %w", err) + } + + if err = daecommon.SetGarageS3APIGlobalBucketCredentials( + ctx, n.secretsStore, garageGlobalBucketCreds, + ); err != nil { + return fmt.Errorf("storing global bucket creds: %w", err) + } + + n.logger.Info(ctx, "Waiting for garage instances to finish initializing") + if err := n.garageWaitForAllocs( + ctx, n.networkConfig.Storage.Allocations, + ); err != nil { + return fmt.Errorf( + "waiting for garage allocations to fully initialize: %w", err, + ) + } + } + + if err := n.updateBootstrapUnsafe(ctx); err != nil { + return fmt.Errorf("updating bootstrap: %w", err) + } + + // Do this now so that everything is stable before returning. This also + // serves a dual-purpose, as it makes sure that the above bootstrap update + // has propagated from the local garage instance, if any. 
+ n.logger.Info(ctx, "Reloading hosts from network storage") + if err := n.reloadHosts(ctx); err != nil { + return fmt.Errorf("Reloading network bootstrap: %w", err) + } + + return nil +} + +func (n *network) initialize(ctx context.Context, isCreate bool) error { + childrenNetworkConfig, err := n.getChildrenNetworkConfig(ctx) + if err != nil { + return fmt.Errorf("getting network config for children: %w", err) + } + + var garageBootstrapPeers []garage.RemoteNode + if isCreate { + garageBootstrapPeers, err = garageInitAllocs( + n.currBootstrap.ThisHost().IP(), + n.networkConfig.Storage.Allocations, + ) + if err != nil { + return fmt.Errorf("initializing storage allocations: %w", err) + } } n.logger.Info(ctx, "Creating child processes") @@ -600,37 +759,29 @@ func (n *network) initialize(ctx context.Context, isCreate bool) error { n.logger.WithNamespace("children"), n.envBinDirPath, n.secretsStore, - n.networkConfig, + childrenNetworkConfig, n.runtimeDir, n.opts.GarageAdminToken, n.nebulaDeviceNamer, n.currBootstrap, &children.Opts{ - GarageNewCluster: isCreate, + GarageNewCluster: isCreate, + GarageBootstrapPeers: garageBootstrapPeers, + GarageDefaultDBEngine: n.opts.garageDefaultDBEngine, }, ) if err != nil { return fmt.Errorf("creating child processes: %w", err) } - createGarageGlobalBucket := isCreate - err = n.postChildrenInit(ctx, prevThisHost, createGarageGlobalBucket) - if err != nil { - n.logger.Error(ctx, "Post-initialization failed, stopping child processes", err) + if err := n.initializePostChildren(ctx, isCreate); err != nil { + n.logger.Error(ctx, "Failed to initialize Network, shutting down children", err) n.children.Shutdown() - return fmt.Errorf("performing post-initialization: %w", err) - } - - // Do this now so that everything is stable before returning. This also - // serves a dual-purpose, as it makes sure that the PUT from the - // postChildrenInit above has propagated from the local garage instance, if - // there is one. 
- n.logger.Info(ctx, "Reloading hosts from network storage") - if err = n.reloadHosts(ctx); err != nil { - return fmt.Errorf("Reloading network bootstrap: %w", err) + return err } n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute) + n.periodically("glmStateTransition", n.glmStateTransition, 10*time.Minute) return nil } @@ -924,10 +1075,11 @@ func (n *network) SetConfig( n.l.Lock() defer n.l.Unlock() - var ( - prevThisHost = n.currBootstrap.ThisHost() - err error - ) + if err := n.garageLayoutMgr.Validate( + ctx, config.Storage.Allocations, + ); err != nil { + return ErrInvalidConfig.WithData(err.Error()) + } if _, err := loadStoreConfig(n.stateDir, &config); err != nil { return fmt.Errorf("storing new config: %w", err) @@ -935,18 +1087,13 @@ func (n *network) SetConfig( n.networkConfig = config - if err := n.preChildrenInit(ctx); err != nil { - return fmt.Errorf("performing pre-initialization: %w", err) + n.logger.Info(ctx, "Making any necessary changes to garage layout") + if err := n.glmStateTransitionUnsafe(ctx); err != nil { + return fmt.Errorf("performing garage layout transition: %w", err) } - n.logger.Info(ctx, "Reloading child processes") - err = n.children.Reload(ctx, n.networkConfig, n.currBootstrap) - if err != nil { - return fmt.Errorf("reloading child processes: %w", err) - } - - if err := n.postChildrenInit(ctx, prevThisHost, false); err != nil { - return fmt.Errorf("performing post-initialization: %w", err) + if err := n.updateBootstrapUnsafe(ctx); err != nil { + return fmt.Errorf("updating bootstrap: %w", err) } return nil diff --git a/go/daemon/network/network_it_test.go b/go/daemon/network/network_it_test.go index 9d0bc0d..a43c2a9 100644 --- a/go/daemon/network/network_it_test.go +++ b/go/daemon/network/network_it_test.go @@ -15,7 +15,6 @@ import ( "time" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" ) func TestCreate(t *testing.T) { @@ -59,6 +58,42 @@ func TestLoad(t *testing.T) { assert.Equal(t, networkConfig, network.getConfig(t)) }) + + t.Run("garage lmdb db engine", func(t *testing.T) { + var ( + h = newIntegrationHarness(t) + network = h.createNetwork(t, "primus", &createNetworkOpts{ + garageDefaultDBEngine: garagesrv.DBEngineLMDB, + }) + metaPath = h.mkDir(t, "meta").Path + ) + + h.logger.Info(h.ctx, "Checking that garage is using the expected db engine") + garageConfig, err := os.ReadFile( + filepath.Join(network.runtimeDir.Path, "garage-3900.toml"), + ) + assert.NoError(t, err) + assert.Contains(t, + string(garageConfig), + `db_engine = "`+garagesrv.DBEngineLMDB+`"`, + ) + assert.NoFileExists(t, filepath.Join(metaPath, "db.sqlite")) + + network.opts.garageDefaultDBEngine = "" + network.restart(t) + + h.logger.Info(h.ctx, "Checking that garage is still using the expected db engine") + garageConfig, err = os.ReadFile( + filepath.Join(network.runtimeDir.Path, "garage-3900.toml"), + ) + assert.NoError(t, err) + assert.Contains(t, + string(garageConfig), + `db_engine = "`+garagesrv.DBEngineLMDB+`"`, + ) + assert.NoFileExists(t, filepath.Join(metaPath, "db.sqlite")) + }) + } func TestJoin(t *testing.T) { @@ -85,7 +120,7 @@ func TestJoin(t *testing.T) { }) ) - t.Log("reloading primus' hosts") + h.logger.Info(h.ctx, "reloading primus' hosts") assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx)) assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t)) @@ -94,57 +129,6 @@ func TestJoin(t *testing.T) { secondus: 1, }) }) - - // Assert that if primus runs the orphan remover at the same moment that - // secondus 
is joining that the layout applied by secondus doesn't get - // overwritten. - t.Run("with alloc/remove orphans after garage layout applied", func(t *testing.T) { - t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly") - - var ( - h = newIntegrationHarness(t) - primus = h.createNetwork(t, "primus", nil) - primusAdminClient = primus.garageAdminClient(t) - secondusBlocker = toolkit.NewTestBlocker(t) - ) - - secondusBlocker.ExpectBlockpoint("garageLayoutApplied").On( - t, h.ctx, func() { - h.logger.Info(h.ctx, "Waiting for new layout to propagate to primus") - err := toolkit.UntilTrue( - h.ctx, h.logger, 1*time.Second, func() (bool, error) { - layout, err := primusAdminClient.GetLayout(h.ctx) - if err != nil { - return false, fmt.Errorf("getting layout: %w", err) - } - - return len(layout.Roles) == 4, nil - }, - ) - - if !assert.NoError(t, err) { - return - } - - //h.logger.Info(h.ctx, "Calling removeOrphanGarageNodes") - //assert.NoError( - // t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx), - //) - }, - ) - - secondus := h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{ - networkConfigOpts: &networkConfigOpts{ - numStorageAllocs: 1, - }, - blocker: secondusBlocker, - }) - - assertGarageLayout(t, map[*integrationHarnessNetwork]int{ - primus: 3, - secondus: 1, - }) - }) } func TestNetwork_GetBootstrap(t *testing.T) { @@ -220,7 +204,7 @@ func TestNetwork_SetConfig(t *testing.T) { assert.NoError(t, network.SetConfig(h.ctx, networkConfig)) - t.Log("Checking that the Host information was updated") + h.logger.Info(h.ctx, "Checking that the Host information was updated") newHostsByName := network.getHostsByName(t) newHost, ok := newHostsByName[network.hostName] assert.True(t, ok) @@ -236,20 +220,20 @@ func TestNetwork_SetConfig(t *testing.T) { RPCPort: 4901, }, newAlloc) - t.Log("Checking that the bootstrap file was written with the new host config") + h.logger.Info(h.ctx, "Checking that the bootstrap file was written with the new host config") var storedBootstrap bootstrap.Bootstrap assert.NoError(t, jsonutil.LoadFile( &storedBootstrap, bootstrap.StateDirPath(network.stateDir.Path), )) assert.Equal(t, newHostsByName, storedBootstrap.Hosts) - t.Log("Checking that garage layout contains the new allocation") + h.logger.Info(h.ctx, "Checking that garage layout contains the new allocation") expRoles := allocsToRoles(network.hostName, allocs) layout, err := network.garageAdminClient(t).GetLayout(h.ctx) assert.NoError(t, err) assert.ElementsMatch(t, expRoles, layout.Roles) - t.Log("Checking that garage is using the expected db engine") + h.logger.Info(h.ctx, "Checking that garage is using the expected db engine") garageConfig, err := os.ReadFile( filepath.Join(network.runtimeDir.Path, "garage-4901.toml"), ) @@ -261,46 +245,6 @@ func TestNetwork_SetConfig(t *testing.T) { assert.FileExists(t, filepath.Join(metaPath, "db.sqlite")) }) - t.Run("add storage alloc/lmdb", func(t *testing.T) { - var ( - h = newIntegrationHarness(t) - network = h.createNetwork(t, "primus", nil) - networkConfig = network.getConfig(t) - dataPath = h.mkDir(t, "data").Path - metaPath = h.mkDir(t, "meta").Path - ) - - networkConfig.Storage.Allocations = append( - networkConfig.Storage.Allocations, - daecommon.ConfigStorageAllocation{ - DataPath: dataPath, - MetaPath: metaPath, - Capacity: 1, - S3APIPort: 4900, - RPCPort: 4901, - AdminPort: 4902, - }, - ) - - // Creating the directory is enough to ensure that Isle chooses LMDB as - // the db engine. 
- lmdbPath := filepath.Join(metaPath, "db.lmdb") - require.NoError(t, os.Mkdir(lmdbPath, 0755)) - - assert.NoError(t, network.SetConfig(h.ctx, networkConfig)) - - t.Log("Checking that garage is using the expected db engine") - garageConfig, err := os.ReadFile( - filepath.Join(network.runtimeDir.Path, "garage-4901.toml"), - ) - assert.NoError(t, err) - assert.Contains(t, - string(garageConfig), - `db_engine = "`+garagesrv.DBEngineLMDB+`"`, - ) - assert.NoFileExists(t, filepath.Join(metaPath, "db.sqlite")) - }) - t.Run("remove storage alloc", func(t *testing.T) { var ( h = newIntegrationHarness(t) @@ -309,17 +253,21 @@ func TestNetwork_SetConfig(t *testing.T) { }) networkConfig = network.getConfig(t) - prevHost = network.getHostsByName(t)[network.hostName] - removedAlloc = networkConfig.Storage.Allocations[3] - removedGarageInst = daecommon.BootstrapGarageHostForAlloc( - prevHost, removedAlloc, - ) + prevHost = network.getHostsByName(t)[network.hostName] + removedAlloc = networkConfig.Storage.Allocations[3] ) + var removedGarageInst bootstrap.GarageHostInstance + for _, removedGarageInst = range prevHost.Garage.Instances { + if removedGarageInst.RPCPort == removedAlloc.RPCPort { + break + } + } + networkConfig.Storage.Allocations = networkConfig.Storage.Allocations[:3] assert.NoError(t, network.SetConfig(h.ctx, networkConfig)) - t.Log("Checking that the Host information was updated") + h.logger.Info(h.ctx, "Checking that the Host information was updated") newHostsByName := network.getHostsByName(t) newHost, ok := newHostsByName[network.hostName] assert.True(t, ok) @@ -328,68 +276,20 @@ func TestNetwork_SetConfig(t *testing.T) { assert.Len(t, allocs, 3) assert.NotContains(t, allocs, removedGarageInst) - t.Log("Checking that the bootstrap file was written with the new host config") + h.logger.Info(h.ctx, "Checking that the bootstrap file was written with the new host config") var storedBootstrap bootstrap.Bootstrap assert.NoError(t, jsonutil.LoadFile( &storedBootstrap, bootstrap.StateDirPath(network.stateDir.Path), )) assert.Equal(t, newHostsByName, storedBootstrap.Hosts) - t.Log("Checking that garage layout contains the new allocation") + h.logger.Info(h.ctx, "Checking that garage layout contains the new allocation") expRoles := allocsToRoles(network.hostName, allocs) layout, err := network.garageAdminClient(t).GetLayout(h.ctx) assert.NoError(t, err) assert.ElementsMatch(t, expRoles, layout.Roles) }) - t.Run("remove all storage allocs", func(t *testing.T) { - t.Skip("This is currently expected to fail. 
Orphan removal is going to be reworked accordingly") - - var ( - h = newIntegrationHarness(t) - primus = h.createNetwork(t, "primus", nil) - secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{ - networkConfigOpts: &networkConfigOpts{ - numStorageAllocs: 1, - }, - }) - networkConfig = secondus.getConfig(t) - - prevHost = secondus.getHostsByName(t)[secondus.hostName] - removedRole = allocsToRoles( - secondus.hostName, prevHost.Garage.Instances, - )[0] - - primusGarageAdminClient = primus.garageAdminClient(t) - ) - - networkConfig.Storage.Allocations = nil - assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig)) - - t.Log("Checking that the Host information was updated") - newHostsByName := primus.getHostsByName(t) - newHost, ok := newHostsByName[secondus.hostName] - assert.True(t, ok) - - allocs := newHost.HostConfigured.Garage.Instances - assert.Empty(t, allocs) - - t.Log("Checking that garage layout still contains the old allocation") - layout, err := primusGarageAdminClient.GetLayout(h.ctx) - assert.NoError(t, err) - assert.Contains(t, layout.Roles, removedRole) - - //t.Log("Removing orphan garage nodes with primus") - //assert.NoError( - // t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx), - //) - - t.Log("Checking that garage layout no longer contains the old allocation") - layout, err = primusGarageAdminClient.GetLayout(h.ctx) - assert.NoError(t, err) - assert.NotContains(t, layout.Roles, removedRole) - }) - t.Run("changes reflected after restart", func(t *testing.T) { var ( h = newIntegrationHarness(t) @@ -408,3 +308,56 @@ func TestNetwork_SetConfig(t *testing.T) { assert.Equal(t, networkConfig, network.getConfig(t)) }) } + +func TestNetwork_glmStateTransition(t *testing.T) { + var ( + h = newIntegrationHarness(t) + primus = h.createNetwork(t, "primus", nil) + secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{ + networkConfigOpts: &networkConfigOpts{ + numStorageAllocs: 1, + }, + }) + secondusNetworkConfig = secondus.getConfig(t) + secondusAdminClient = secondus.garageAdminClient(t) + secondusNetworkDirect = secondus.Network.(*network) + secondsBootstrapHost = primus.getHostsByName(t)["secondus"] + ) + + assertGarageLayout(t, map[*integrationHarnessNetwork]int{ + primus: 3, + secondus: 1, + }) + + secondusNetworkConfig.Storage.Allocations = nil + assert.NoError(t, secondus.SetConfig(h.ctx, secondusNetworkConfig)) + + assert.Len(t, secondusNetworkDirect.children.ActiveStorageAllocations(), 1) + + h.logger.Info(h.ctx, "Waiting for secondus to finish draining") + err := toolkit.UntilTrue( + h.ctx, h.logger, 1*time.Second, func() (bool, error) { + status, err := secondusAdminClient.Status(h.ctx) + if err != nil { + return false, fmt.Errorf("getting status: %w", err) + } + + for _, node := range status.Nodes { + if node.Addr.Addr() == secondsBootstrapHost.IP() { + return !node.Draining, nil + } + } + + return false, fmt.Errorf("secondus not found in cluster status: %+v", status) + }, + ) + assert.NoError(t, err) + + h.logger.Info(h.ctx, "Running GLM state transition") + assert.NoError(t, secondusNetworkDirect.glmStateTransition(h.ctx)) + + assertGarageLayout(t, map[*integrationHarnessNetwork]int{ + primus: 3, + }) + assert.Empty(t, secondusNetworkDirect.children.ActiveStorageAllocations()) +} diff --git a/go/daemon/network/network_it_util_test.go b/go/daemon/network/network_it_util_test.go index 39a7961..62b4789 100644 --- a/go/daemon/network/network_it_util_test.go +++ b/go/daemon/network/network_it_util_test.go @@ -8,6 +8,7 @@ import ( 
"isle/daemon/children" "isle/daemon/daecommon" "isle/garage" + "isle/garage/garagesrv" "isle/nebula" "isle/toolkit" "os" @@ -17,6 +18,7 @@ import ( "sync/atomic" "testing" + "dev.mediocregopher.com/mediocre-go-lib.git/mctx" "dev.mediocregopher.com/mediocre-go-lib.git/mlog" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -75,24 +77,32 @@ func newIntegrationHarness(t *testing.T) *integrationHarness { t.Parallel() toolkit.MarkIntegrationTest(t) + var ( + ctx = context.Background() + logger = toolkit.NewTestLogger(t) + ) + rootDir, err := os.MkdirTemp("", "isle-network-it-test.*") require.NoError(t, err) - t.Logf("Temporary test directory: %q", rootDir) + { + ctx := mctx.Annotate(ctx, "rootDir", rootDir) + logger.Info(ctx, "Created temporary test root directory") - t.Cleanup(func() { - if t.Failed() { - t.Logf("Temp directory for failed test not deleted: %q", rootDir) - return - } + t.Cleanup(func() { + if t.Failed() { + logger.Info(ctx, "Test failed, temporarty test root directory NOT deleted") + return + } - t.Logf("Deleting temp directory %q", rootDir) - assert.NoError(t, os.RemoveAll(rootDir)) - }) + logger.Info(ctx, "Deleting temporary test root directory") + assert.NoError(t, os.RemoveAll(rootDir)) + }) + } return &integrationHarness{ - ctx: context.Background(), - logger: toolkit.NewTestLogger(t), + ctx: ctx, + logger: logger, constructors: newConstructors(), rootDir: toolkit.Dir{Path: rootDir}, nebulaDeviceNamer: children.NewNebulaDeviceNamer(), @@ -102,7 +112,10 @@ func newIntegrationHarness(t *testing.T) *integrationHarness { func (h *integrationHarness) mkDir(t *testing.T, name string) toolkit.Dir { fullName := fmt.Sprintf("%s-%d", name, h.dirCounter.Add(1)-1) - t.Logf("Creating directory %q", fullName) + h.logger.Info( + mctx.Annotate(h.ctx, "dirName", fullName), + "Creating directory", + ) d, err := h.rootDir.MkChildDir(fullName, false) require.NoError(t, err) @@ -149,9 +162,10 @@ func (h *integrationHarness) mkNetworkConfig( } type createNetworkOpts struct { - creationParams bootstrap.CreationParams - manualShutdown bool - numStorageAllocs int + creationParams bootstrap.CreationParams + manualShutdown bool + numStorageAllocs int + garageDefaultDBEngine garagesrv.DBEngine } func (o *createNetworkOpts) withDefaults() *createNetworkOpts { @@ -187,11 +201,10 @@ func (h *integrationHarness) createNetwork( hostNameStr string, opts *createNetworkOpts, ) *integrationHarnessNetwork { - t.Logf("Creating as %q", hostNameStr) opts = opts.withDefaults() var ( - logger = h.logger.WithNamespace("network").WithNamespace(hostNameStr) + logger = h.logger.WithNamespace("networks").WithNamespace(hostNameStr) networkConfig = h.mkNetworkConfig(t, &networkConfigOpts{ hasPublicAddr: true, numStorageAllocs: opts.numStorageAllocs, @@ -204,11 +217,13 @@ func (h *integrationHarness) createNetwork( hostName = nebula.HostName(hostNameStr) networkOpts = &Opts{ - GarageAdminToken: "admin_token", - Config: &networkConfig, + GarageAdminToken: "admin_token", + Config: &networkConfig, + garageDefaultDBEngine: opts.garageDefaultDBEngine, } ) + logger.Info(h.ctx, "Creating network") network, err := h.constructors.create( h.ctx, logger, @@ -239,9 +254,9 @@ func (h *integrationHarness) createNetwork( if !opts.manualShutdown { t.Cleanup(func() { - t.Logf("Shutting down Network %q", hostNameStr) + logger.Info(nh.ctx, "Shutting down Network") if err := nh.Shutdown(); err != nil { - t.Logf("Shutting down Network %q failed: %v", hostNameStr, err) + logger.Error(nh.ctx, "Shutting down Network 
failed", err) } }) } @@ -273,18 +288,24 @@ func (h *integrationHarness) joinNetwork( opts *joinNetworkOpts, ) *integrationHarnessNetwork { opts = opts.withDefaults() - hostName := nebula.HostName(hostNameStr) - t.Logf("Creating bootstrap for %q", hostNameStr) - joiningBootstrap, err := network.CreateHost(h.ctx, hostName, CreateHostOpts{ - CanCreateHosts: opts.canCreateHosts, - }) + var ( + hostName = nebula.HostName(hostNameStr) + createHostCtx = mctx.Annotate(h.ctx, "hostName", hostName) + ) + + h.logger.Info(createHostCtx, "Creating bootstrap") + joiningBootstrap, err := network.CreateHost( + createHostCtx, hostName, CreateHostOpts{ + CanCreateHosts: opts.canCreateHosts, + }, + ) if err != nil { t.Fatalf("creating host joining bootstrap: %v", err) } var ( - logger = h.logger.WithNamespace("network").WithNamespace(hostNameStr) + logger = h.logger.WithNamespace("networks").WithNamespace(hostNameStr) networkConfig = h.mkNetworkConfig(t, opts.networkConfigOpts) stateDir = h.mkDir(t, "state") runtimeDir = h.mkDir(t, "runtime") @@ -295,7 +316,7 @@ func (h *integrationHarness) joinNetwork( } ) - t.Logf("Joining as %q", hostNameStr) + logger.Info(h.ctx, "Joining") joinedNetwork, err := h.constructors.join( h.ctx, logger, @@ -324,9 +345,9 @@ func (h *integrationHarness) joinNetwork( if !opts.manualShutdown { t.Cleanup(func() { - t.Logf("Shutting down Network %q", hostNameStr) + nh.logger.Info(nh.ctx, "Shutting down Network") if err := nh.Shutdown(); err != nil { - t.Logf("Shutting down Network %q failed: %v", hostNameStr, err) + nh.logger.Error(nh.ctx, "Shutting down Network failed", err) } }) } @@ -335,10 +356,10 @@ func (h *integrationHarness) joinNetwork( } func (nh *integrationHarnessNetwork) restart(t *testing.T) { - t.Log("Shutting down network (restart)") + nh.logger.Info(nh.ctx, "Shutting down network (restart)") require.NoError(t, nh.Network.Shutdown()) - t.Log("Loading network (restart)") + nh.logger.Info(nh.ctx, "Loading network (restart)") var err error nh.Network, err = nh.constructors.load( nh.ctx, diff --git a/go/garage/admin_client.go b/go/garage/admin_client.go index 005c9f9..e63a546 100644 --- a/go/garage/admin_client.go +++ b/go/garage/admin_client.go @@ -141,6 +141,7 @@ type KnownNode struct { Addr netip.AddrPort `json:"addr"` IsUp bool `json:"isUp"` LastSeenSecsAgo int `json:"lastSeenSecsAgo"` + Draining bool `json:"draining"` HostName string `json:"hostname"` } diff --git a/go/garage/garagesrv/garagesrv.go b/go/garage/garagesrv/garagesrv.go index be9a435..3df7d4c 100644 --- a/go/garage/garagesrv/garagesrv.go +++ b/go/garage/garagesrv/garagesrv.go @@ -11,7 +11,6 @@ import ( "io/fs" "os" "path/filepath" - "strconv" ) // DBEngine enumerates the garage db engines which are supported by Isle. @@ -31,13 +30,9 @@ func nodeKeyPubPath(metaDirPath string) string { return filepath.Join(metaDirPath, "node_key.pub") } -func nodeRPCPortPath(metaDirPath string) string { - return filepath.Join(metaDirPath, "isle", "rpc_port") -} - -// loadAllocID returns the peer ID (ie the public key) of the node at the given +// LoadAllocID returns the peer ID (ie the public key) of the node at the given // meta directory. -func loadAllocID(metaDirPath string) (string, error) { +func LoadAllocID(metaDirPath string) (string, error) { nodeKeyPubPath := nodeKeyPubPath(metaDirPath) pubKey, err := os.ReadFile(nodeKeyPubPath) @@ -61,8 +56,8 @@ func generatePeerKey() (pubKey, privKey []byte) { // InitAlloc initializes the meta directory and keys for a particular // allocation, if it hasn't been done so already. 
It returns the peer ID (ie the -// public key) and the rpc port in any case. -func InitAlloc(metaDirPath string, initRPCPort int) (string, int, error) { +// public key) in any case. +func InitAlloc(metaDirPath string) (string, error) { initDirFor := func(path string) error { dir := filepath.Dir(path) @@ -70,110 +65,97 @@ func InitAlloc(metaDirPath string, initRPCPort int) (string, int, error) { } var err error - exists := func(path string) bool { - if err != nil { return false - } else if _, err = os.Stat(path); errors.Is(err, fs.ErrNotExist) { err = nil return false - } else if err != nil { err = fmt.Errorf("checking if %q exists: %w", path, err) return false } - return true } - nodeKeyPath := nodeKeyPath(metaDirPath) - nodeKeyPubPath := nodeKeyPubPath(metaDirPath) - nodeRPCPortPath := nodeRPCPortPath(metaDirPath) - - nodeKeyPathExists := exists(nodeKeyPath) - nodeKeyPubPathExists := exists(nodeKeyPubPath) - nodeRPCPortPathExists := exists(nodeRPCPortPath) - - if err != nil { - return "", 0, err - - } else if nodeKeyPubPathExists != nodeKeyPathExists { - return "", 0, fmt.Errorf("%q or %q exist without the other existing", nodeKeyPath, nodeKeyPubPath) - - } - var ( - pubKeyStr string - rpcPort int + nodeKeyPath = nodeKeyPath(metaDirPath) + nodeKeyPubPath = nodeKeyPubPath(metaDirPath) + + nodeKeyPathExists = exists(nodeKeyPath) + nodeKeyPubPathExists = exists(nodeKeyPubPath) ) - if nodeKeyPathExists { + if err != nil { + return "", err + } else if nodeKeyPubPathExists != nodeKeyPathExists { + return "", fmt.Errorf( + "%q or %q exist without the other existing", + nodeKeyPath, + nodeKeyPubPath, + ) + } - if pubKeyStr, err = loadAllocID(metaDirPath); err != nil { - return "", 0, fmt.Errorf("reading node public key file: %w", err) + var pubKeyStr string + if nodeKeyPathExists { + if pubKeyStr, err = LoadAllocID(metaDirPath); err != nil { + return "", fmt.Errorf("reading node public key file: %w", err) } } else { - if err := initDirFor(nodeKeyPath); err != nil { - return "", 0, fmt.Errorf("creating directory for %q: %w", nodeKeyPath, err) + return "", fmt.Errorf( + "creating directory for %q: %w", nodeKeyPath, err, + ) } pubKey, privKey := generatePeerKey() - if err := os.WriteFile(nodeKeyPath, privKey, 0400); err != nil { - return "", 0, fmt.Errorf("writing private key to %q: %w", nodeKeyPath, err) + return "", fmt.Errorf( + "writing private key to %q: %w", nodeKeyPath, err, + ) } else if err := os.WriteFile(nodeKeyPubPath, pubKey, 0440); err != nil { - return "", 0, fmt.Errorf("writing public key to %q: %w", nodeKeyPubPath, err) + return "", fmt.Errorf( + "writing public key to %q: %w", nodeKeyPubPath, err, + ) } pubKeyStr = hex.EncodeToString(pubKey) } - if nodeRPCPortPathExists { - - if rpcPortStr, err := os.ReadFile(nodeRPCPortPath); err != nil { - return "", 0, fmt.Errorf("reading rpc port from %q: %w", nodeRPCPortPath, err) - - } else if rpcPort, err = strconv.Atoi(string(rpcPortStr)); err != nil { - return "", 0, fmt.Errorf("parsing rpc port %q from %q: %w", rpcPortStr, nodeRPCPortPath, err) - } - - } else { - - if err := initDirFor(nodeRPCPortPath); err != nil { - return "", 0, fmt.Errorf("creating directory for %q: %w", nodeRPCPortPath, err) - } - - rpcPortStr := strconv.Itoa(initRPCPort) - - if err := os.WriteFile(nodeRPCPortPath, []byte(rpcPortStr), 0440); err != nil { - return "", 0, fmt.Errorf("writing rpc port %q to %q: %w", rpcPortStr, nodeRPCPortPath, err) - } - - rpcPort = initRPCPort - } - - return pubKeyStr, rpcPort, nil + return pubKeyStr, nil } // GetDBEngine returns the 
DBEngine currently being used at a particular meta -// data directory. Defaults to DBEngineSqlite if the directory doesn't exist or +// data directory, or returns the default if the directory doesn't exist or // hasn't been fully initialized yet. -func GetDBEngine(metaDirPath string) (DBEngine, error) { - dbLMDBPath := filepath.Join(metaDirPath, "db.lmdb") - - stat, err := os.Stat(dbLMDBPath) - if errors.Is(err, fs.ErrNotExist) { - return DBEngineSqlite, nil - } else if err != nil { - return "", fmt.Errorf("checking if %q exists: %w", dbLMDBPath, err) - } else if stat.IsDir() { - return DBEngineLMDB, nil +func GetDBEngine( + metaDirPath string, defaultDBEngine DBEngine, +) ( + DBEngine, error, +) { + search := []struct { + dbEngine DBEngine + path string + pathIsDir bool + }{ + {DBEngineLMDB, filepath.Join(metaDirPath, "db.lmdb"), true}, + {DBEngineSqlite, filepath.Join(metaDirPath, "db.sqlite"), false}, } - return DBEngineSqlite, nil + for _, s := range search { + stat, err := os.Stat(s.path) + if errors.Is(err, fs.ErrNotExist) { + continue + } else if err != nil { + return "", fmt.Errorf("checking if %q exists: %w", s.path, err) + } else if stat.IsDir() != s.pathIsDir { + continue + } + + return s.dbEngine, nil + } + + return defaultDBEngine, nil } diff --git a/go/garage/peer.go b/go/garage/peer.go index 4481281..a0a35f1 100644 --- a/go/garage/peer.go +++ b/go/garage/peer.go @@ -6,25 +6,28 @@ import ( "strconv" ) -// RemoteNode describes all information necessary to connect to a given garage -// node. -type RemoteNode struct { - ID string +// Node describes the common fields of both a [RemoteNode] and [LocalNode] +type Node struct { IP string RPCPort int S3APIPort int } -// LocalNode describes the configuration of a local garage instance. -type LocalNode struct { - RemoteNode - - AdminPort int +// RPCAddr returns the address of the node's RPC port. +func (p Node) RPCAddr() string { + return net.JoinHostPort(p.IP, strconv.Itoa(p.RPCPort)) } -// RPCAddr returns the address of the node's RPC port. -func (p RemoteNode) RPCAddr() string { - return net.JoinHostPort(p.IP, strconv.Itoa(p.RPCPort)) +// S3APIAddr returns the address of the node's S3 API port. +func (p Node) S3APIAddr() string { + return net.JoinHostPort(p.IP, strconv.Itoa(p.S3APIPort)) +} + +// RemoteNode describes all information necessary to connect to a given garage +// node. +type RemoteNode struct { + Node + ID string } // RPCNodeAddr returns the full node address (e.g. "id@ip:port") of the garage @@ -33,9 +36,10 @@ func (p RemoteNode) RPCNodeAddr() string { return fmt.Sprintf("%s@%s", p.ID, p.RPCAddr()) } -// S3APIAddr returns the address of the node's S3 API port. -func (p RemoteNode) S3APIAddr() string { - return net.JoinHostPort(p.IP, strconv.Itoa(p.S3APIPort)) +// LocalNode describes the configuration of a local garage instance. +type LocalNode struct { + Node + AdminPort int } // AdminAddr returns the address of the node's S3 API port. diff --git a/tasks/bugs/validate-non-conflicting-ports-or-dirs.md b/tasks/bugs/validate-non-conflicting-ports-or-dirs.md new file mode 100644 index 0000000..95b27af --- /dev/null +++ b/tasks/bugs/validate-non-conflicting-ports-or-dirs.md @@ -0,0 +1,10 @@ +--- +type: task +--- + +NetworkConfig should validate that no ports which have been configured to be +used conflict with each other. In other words, no port used by a storage +allocation should be used twice within that allocation, or used by another +storage allocation. + +Same goes for directories being used. 
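
As a rough illustration of the validation this task calls for, the sketch below checks that no port or directory is claimed more than once across a host's storage allocations. The `checkAllocConflicts` helper and its error wording are hypothetical; only the `ConfigStorageAllocation` field names (DataPath, MetaPath, S3APIPort, RPCPort, AdminPort) come from the existing config type in this patch.

```go
package daecommon

import "fmt"

// checkAllocConflicts reports an error if any port or directory is configured
// for more than one use across the given storage allocations. Hypothetical
// helper sketching the validation described above.
func checkAllocConflicts(allocs []ConfigStorageAllocation) error {
	var (
		seenPorts = map[int]string{}
		seenDirs  = map[string]string{}
	)

	for i, alloc := range allocs {
		// Every port an allocation listens on must be globally unique.
		for name, port := range map[string]int{
			"s3_api_port": alloc.S3APIPort,
			"rpc_port":    alloc.RPCPort,
			"admin_port":  alloc.AdminPort,
		} {
			what := fmt.Sprintf("allocation %d %s", i, name)
			if prev, ok := seenPorts[port]; ok {
				return fmt.Errorf("%s conflicts with %s (port %d)", what, prev, port)
			}
			seenPorts[port] = what
		}

		// The same rule applies to the data and meta directories.
		for name, dir := range map[string]string{
			"data_path": alloc.DataPath,
			"meta_path": alloc.MetaPath,
		} {
			what := fmt.Sprintf("allocation %d %s", i, name)
			if prev, ok := seenDirs[dir]; ok {
				return fmt.Errorf("%s conflicts with %s (%q)", what, prev, dir)
			}
			seenDirs[dir] = what
		}
	}

	return nil
}
```

One natural place to run such a check would be alongside the GLM allocation validation this patch already performs when a config is loaded or set.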
diff --git a/tasks/misc/storage-allocation-status.md b/tasks/misc/storage-allocation-status.md new file mode 100644 index 0000000..6620f7d --- /dev/null +++ b/tasks/misc/storage-allocation-status.md @@ -0,0 +1,10 @@ +--- +type: task +--- + +There should be a `storage allocation status` command, separate from the +`storage allocation list` command, which shows whether any allocations are +currently draining (among any other information which might be useful). + +Once implemented the "Contributing Storage" operator documentation should be +updated to recommend using this command during allocation removal.
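
As context for how the proposed `storage allocation status` command might gather its data, here is a rough sketch built from pieces this patch introduces: `Children.ActiveStorageAllocations` and the `Draining` flag now decoded on `garage.KnownNode`. The package name, the `allocStatus` type, the `allocationStatuses` helper, and the port-based matching of cluster nodes to allocations are all assumptions, not part of the patch.

```go
package alloccmd

import (
	"context"
	"fmt"

	"isle/daemon/children"
	"isle/garage"
)

// allocStatus is an illustrative summary row for the proposed command.
type allocStatus struct {
	RPCPort  int
	IsUp     bool
	Draining bool
}

// allocationStatuses sketches how the command could work: take the
// allocations the Children instance is currently running, then look each one
// up in the garage cluster status, which now carries per-node draining
// information. Nodes are matched to allocations by RPC port here, which is an
// assumption about how the real command would correlate them.
func allocationStatuses(
	ctx context.Context,
	c *children.Children,
	adminClient *garage.AdminClient,
) ([]allocStatus, error) {
	status, err := adminClient.Status(ctx)
	if err != nil {
		return nil, fmt.Errorf("getting garage cluster status: %w", err)
	}

	var statuses []allocStatus
	for _, alloc := range c.ActiveStorageAllocations() {
		s := allocStatus{RPCPort: alloc.RPCPort}
		for _, node := range status.Nodes {
			if int(node.Addr.Port()) == alloc.RPCPort {
				s.IsUp = node.IsUp
				s.Draining = node.Draining
				break
			}
		}
		statuses = append(statuses, s)
	}

	return statuses, nil
}
```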