From 2cdec586b241f1a841bb84e3554d3d62db202134 Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Thu, 31 Oct 2024 13:04:19 +0100 Subject: [PATCH] Implement removal of nodes from the garage layout --- go/daemon/network/bootstrap.go | 2 + go/daemon/network/garage.go | 35 ++++--- go/daemon/network/network.go | 57 +++++++---- go/daemon/network/network_it_test.go | 115 ++++++++++++++-------- go/daemon/network/network_it_util_test.go | 71 ++++++++++--- go/garage/admin_client.go | 17 +++- 6 files changed, 210 insertions(+), 87 deletions(-) diff --git a/go/daemon/network/bootstrap.go b/go/daemon/network/bootstrap.go index 20e3a72..fbbddf0 100644 --- a/go/daemon/network/bootstrap.go +++ b/go/daemon/network/bootstrap.go @@ -6,6 +6,7 @@ import ( "isle/daemon/daecommon" "isle/garage/garagesrv" "isle/jsonutil" + "maps" "os" "path/filepath" ) @@ -67,6 +68,7 @@ func coalesceNetworkConfigAndBootstrap( } } + hostBootstrap.Hosts = maps.Clone(hostBootstrap.Hosts) hostBootstrap.Hosts[host.Name] = host return hostBootstrap, nil diff --git a/go/daemon/network/garage.go b/go/daemon/network/garage.go index 68b9991..2af314e 100644 --- a/go/daemon/network/garage.go +++ b/go/daemon/network/garage.go @@ -61,15 +61,12 @@ func newGarageAdminClient( logger *mlog.Logger, networkConfig daecommon.NetworkConfig, adminToken string, - hostBootstrap bootstrap.Bootstrap, + host bootstrap.Host, ) *garage.AdminClient { - - thisHost := hostBootstrap.ThisHost() - return garage.NewAdminClient( garageAdminClientLogger(logger), net.JoinHostPort( - thisHost.IP().String(), + host.IP().String(), strconv.Itoa(networkConfig.Storage.Allocations[0].AdminPort), ), adminToken, @@ -81,24 +78,26 @@ func garageApplyLayout( logger *mlog.Logger, networkConfig daecommon.NetworkConfig, adminToken string, - hostBootstrap bootstrap.Bootstrap, + prevHost, currHost bootstrap.Host, ) error { var ( adminClient = newGarageAdminClient( - logger, networkConfig, adminToken, hostBootstrap, + logger, networkConfig, adminToken, currHost, ) - thisHost = hostBootstrap.ThisHost() - hostName = thisHost.Name + hostName = currHost.Name allocs = networkConfig.Storage.Allocations peers = make([]garage.PeerLayout, len(allocs)) + peerIDs = map[string]struct{}{} + + idsToRemove = make([]string, 0, len(prevHost.Garage.Instances)) ) defer adminClient.Close() for i, alloc := range allocs { - - id := daecommon.BootstrapGarageHostForAlloc(thisHost, alloc).ID + id := daecommon.BootstrapGarageHostForAlloc(currHost, alloc).ID + peerIDs[id] = struct{}{} zone := string(hostName) if alloc.Zone != "" { @@ -113,7 +112,13 @@ func garageApplyLayout( } } - return adminClient.ApplyLayout(ctx, peers) + for _, prevInst := range prevHost.Garage.Instances { + if _, ok := peerIDs[prevInst.ID]; !ok { + idsToRemove = append(idsToRemove, prevInst.ID) + } + } + + return adminClient.ApplyLayout(ctx, peers, idsToRemove) } func garageInitializeGlobalBucket( @@ -121,13 +126,11 @@ func garageInitializeGlobalBucket( logger *mlog.Logger, networkConfig daecommon.NetworkConfig, adminToken string, - hostBootstrap bootstrap.Bootstrap, + host bootstrap.Host, ) ( garage.S3APICredentials, error, ) { - adminClient := newGarageAdminClient( - logger, networkConfig, adminToken, hostBootstrap, - ) + adminClient := newGarageAdminClient(logger, networkConfig, adminToken, host) defer adminClient.Close() creds, err := adminClient.CreateS3APICredentials( diff --git a/go/daemon/network/network.go b/go/daemon/network/network.go index af5c386..1b1dc64 100644 --- a/go/daemon/network/network.go +++ b/go/daemon/network/network.go @@ -335,7 +335,7 @@ func Create( stateDir toolkit.Dir, runtimeDir toolkit.Dir, creationParams bootstrap.CreationParams, - ipNet nebula.IPNet, // TODO should this go in CreationParams? + ipNet nebula.IPNet, hostName nebula.HostName, opts *Opts, ) ( @@ -408,14 +408,16 @@ func (n *network) initializeDirs(mayExist bool) error { } func (n *network) initialize( - ctx context.Context, currBootstrap bootstrap.Bootstrap, + ctx context.Context, prevBootstrap bootstrap.Bootstrap, ) error { + prevThisHost := prevBootstrap.ThisHost() + // we update this Host's data using whatever configuration has been provided // by the daemon config. This way the network has the most up-to-date // possible bootstrap. This updated bootstrap will later get updated in // garage as a background task, so other hosts will see it as well. currBootstrap, err := coalesceNetworkConfigAndBootstrap( - n.networkConfig, currBootstrap, + n.networkConfig, prevBootstrap, ) if err != nil { return fmt.Errorf("combining configuration into bootstrap: %w", err) @@ -447,7 +449,7 @@ func (n *network) initialize( n.logger.Info(ctx, "Child processes created") - if err := n.postInit(ctx); err != nil { + if err := n.postInit(ctx, prevThisHost); err != nil { n.logger.Error(ctx, "Post-initialization failed, stopping child processes", err) n.children.Shutdown() return fmt.Errorf("performing post-initialization: %w", err) @@ -480,15 +482,22 @@ func (n *network) initialize( return nil } -func (n *network) postInit(ctx context.Context) error { - if len(n.networkConfig.Storage.Allocations) > 0 { +func (n *network) postInit( + ctx context.Context, prevThisHost bootstrap.Host, +) error { + n.l.RLock() + defer n.l.RUnlock() + + thisHost := n.currBootstrap.ThisHost() + + if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 { n.logger.Info(ctx, "Applying garage layout") if err := garageApplyLayout( ctx, n.logger, n.networkConfig, n.opts.GarageAdminToken, - n.currBootstrap, + prevThisHost, thisHost, ); err != nil { return fmt.Errorf("applying garage layout: %w", err) } @@ -509,7 +518,7 @@ func (n *network) postInit(ctx context.Context) error { n.logger, n.networkConfig, n.opts.GarageAdminToken, - n.currBootstrap, + thisHost, ) if err != nil { return fmt.Errorf("initializing global bucket: %w", err) @@ -559,8 +568,7 @@ func (n *network) reloadHosts(ctx context.Context) error { thisHost := currBootstrap.ThisHost() newBootstrap.Hosts[thisHost.Name] = thisHost - err = n.reload(ctx, nil, &newBootstrap) - if err != nil { + if _, err = n.reload(ctx, nil, &newBootstrap); err != nil { return fmt.Errorf("reloading with new host data: %w", err) } @@ -586,14 +594,19 @@ func (n *network) reloadLoop(ctx context.Context) { } } +// returns the bootstrap prior to the reload being applied. func (n *network) reload( ctx context.Context, newNetworkConfig *daecommon.NetworkConfig, newBootstrap *bootstrap.Bootstrap, -) error { +) ( + bootstrap.Bootstrap, error, +) { n.l.Lock() defer n.l.Unlock() + prevBootstrap := n.currBootstrap + if newBootstrap != nil { n.currBootstrap = *newBootstrap } @@ -606,22 +619,28 @@ func (n *network) reload( if n.currBootstrap, err = coalesceNetworkConfigAndBootstrap( n.networkConfig, n.currBootstrap, ); err != nil { - return fmt.Errorf("combining configuration into bootstrap: %w", err) + return bootstrap.Bootstrap{}, fmt.Errorf( + "combining configuration into bootstrap: %w", err, + ) } n.logger.Info(ctx, "Writing updated bootstrap to state dir") err = writeBootstrapToStateDir(n.stateDir.Path, n.currBootstrap) if err != nil { - return fmt.Errorf("writing bootstrap to state dir: %w", err) + return bootstrap.Bootstrap{}, fmt.Errorf( + "writing bootstrap to state dir: %w", err, + ) } n.logger.Info(ctx, "Reloading child processes") err = n.children.Reload(ctx, n.networkConfig, n.currBootstrap) if err != nil { - return fmt.Errorf("reloading child processes: %w", err) + return bootstrap.Bootstrap{}, fmt.Errorf( + "reloading child processes: %w", err, + ) } - return nil + return prevBootstrap, nil } func withCurrBootstrap[Res any]( @@ -873,8 +892,7 @@ func (n *network) CreateHost( newBootstrap.Hosts = joiningBootstrap.Bootstrap.Hosts n.logger.Info(ctx, "Reloading local state with new host") - err = n.reload(ctx, nil, &newBootstrap) - if err != nil { + if _, err = n.reload(ctx, nil, &newBootstrap); err != nil { return JoiningBootstrap{}, fmt.Errorf("reloading child processes: %w", err) } @@ -921,11 +939,12 @@ func (n *network) GetConfig(context.Context) (daecommon.NetworkConfig, error) { func (n *network) SetConfig( ctx context.Context, config daecommon.NetworkConfig, ) error { - if err := n.reload(ctx, &config, nil); err != nil { + prevBootstrap, err := n.reload(ctx, &config, nil) + if err != nil { return fmt.Errorf("reloading config: %w", err) } - if err := n.postInit(ctx); err != nil { + if err := n.postInit(ctx, prevBootstrap.ThisHost()); err != nil { return fmt.Errorf("performing post-initialization: %w", err) } diff --git a/go/daemon/network/network_it_test.go b/go/daemon/network/network_it_test.go index 260a488..4875242 100644 --- a/go/daemon/network/network_it_test.go +++ b/go/daemon/network/network_it_test.go @@ -19,7 +19,9 @@ func TestCreate(t *testing.T) { gotCreationParams, err := LoadCreationParams(network.stateDir) assert.NoError(t, err) - assert.Equal(t, gotCreationParams, network.creationParams) + assert.Equal( + t, gotCreationParams, network.getBootstrap(t).NetworkCreationParams, + ) } func TestLoad(t *testing.T) { @@ -37,7 +39,7 @@ func TestLoad(t *testing.T) { loadedNetwork, err := Load( h.ctx, h.logger.WithNamespace("loadedNetwork"), - network.networkConfig, + network.getConfig(t), getEnvBinDirPath(), network.stateDir, h.mkDir(t, "runtime"), @@ -76,18 +78,34 @@ func TestNetwork_GetConfig(t *testing.T) { config, err := network.GetConfig(h.ctx) assert.NoError(t, err) - assert.Equal(t, config, network.networkConfig) + assert.Equal(t, config, network.getConfig(t)) } func TestNetwork_SetConfig(t *testing.T) { - t.Run("adding storage alloc", func(t *testing.T) { + allocsToPeerLayouts := func( + hostName nebula.HostName, allocs []bootstrap.GarageHostInstance, + ) []garage.PeerLayout { + peers := make([]garage.PeerLayout, len(allocs)) + for i := range allocs { + peers[i] = garage.PeerLayout{ + ID: allocs[i].ID, + Capacity: 1_000_000_000, + Zone: string(hostName), + Tags: []string{}, + } + } + return peers + } + + t.Run("add storage alloc", func(t *testing.T) { var ( - h = newIntegrationHarness(t) - network = h.createNetwork(t, "primus", nil) + h = newIntegrationHarness(t) + network = h.createNetwork(t, "primus", nil) + networkConfig = network.getConfig(t) ) - network.networkConfig.Storage.Allocations = append( - network.networkConfig.Storage.Allocations, + networkConfig.Storage.Allocations = append( + networkConfig.Storage.Allocations, daecommon.ConfigStorageAllocation{ DataPath: h.mkDir(t, "data").Path, MetaPath: h.mkDir(t, "meta").Path, @@ -98,17 +116,10 @@ func TestNetwork_SetConfig(t *testing.T) { }, ) - assert.NoError(t, network.SetConfig(h.ctx, network.networkConfig)) - - // Check that the Host information was updated - newHosts, err := network.GetHosts(h.ctx) - assert.NoError(t, err) - - newHostsByName := map[nebula.HostName]bootstrap.Host{} - for _, h := range newHosts { - newHostsByName[h.Name] = h - } + assert.NoError(t, network.SetConfig(h.ctx, networkConfig)) + t.Log("Checking that the Host information was updated") + newHostsByName := network.getHostsByName(t) newHost, ok := newHostsByName[network.hostName] assert.True(t, ok) @@ -123,34 +134,60 @@ func TestNetwork_SetConfig(t *testing.T) { RPCPort: 4901, }, newAlloc) - // Check that the bootstrap file was written with the new host config + t.Log("Checking that the bootstrap file was written with the new host config") var storedBootstrap bootstrap.Bootstrap assert.NoError(t, jsonutil.LoadFile( &storedBootstrap, bootstrap.StateDirPath(network.stateDir.Path), )) assert.Equal(t, newHostsByName, storedBootstrap.Hosts) - // Check that garage layout contains the new allocation - garageAdminClient := newGarageAdminClient( - h.logger, - network.networkConfig, - network.opts.GarageAdminToken, - storedBootstrap, - ) - - layout, err := garageAdminClient.GetLayout(h.ctx) + t.Log("Checking that garage layout contains the new allocation") + expPeers := allocsToPeerLayouts(network.hostName, allocs) + layout, err := network.garageAdminClient(t).GetLayout(h.ctx) assert.NoError(t, err) - - expPeers := make([]garage.PeerLayout, len(allocs)) - for i := range allocs { - expPeers[i] = garage.PeerLayout{ - ID: allocs[i].ID, - Capacity: 1_000_000_000, - Zone: string(network.hostName), - Tags: []string{}, - } - } - assert.ElementsMatch(t, expPeers, layout.Peers) }) + + t.Run("remove storage alloc", func(t *testing.T) { + var ( + h = newIntegrationHarness(t) + network = h.createNetwork(t, "primus", &createNetworkOpts{ + numStorageAllocs: 4, + }) + networkConfig = network.getConfig(t) + + prevHost = network.getHostsByName(t)[network.hostName] + removedAlloc = networkConfig.Storage.Allocations[3] + removedGarageInst = daecommon.BootstrapGarageHostForAlloc( + prevHost, removedAlloc, + ) + ) + + networkConfig.Storage.Allocations = networkConfig.Storage.Allocations[:3] + assert.NoError(t, network.SetConfig(h.ctx, networkConfig)) + + t.Log("Checking that the Host information was updated") + newHostsByName := network.getHostsByName(t) + newHost, ok := newHostsByName[network.hostName] + assert.True(t, ok) + + allocs := newHost.HostConfigured.Garage.Instances + assert.Len(t, allocs, 3) + assert.NotContains(t, allocs, removedGarageInst) + + t.Log("Checking that the bootstrap file was written with the new host config") + var storedBootstrap bootstrap.Bootstrap + assert.NoError(t, jsonutil.LoadFile( + &storedBootstrap, bootstrap.StateDirPath(network.stateDir.Path), + )) + assert.Equal(t, newHostsByName, storedBootstrap.Hosts) + + t.Log("Checking that garage layout contains the new allocation") + expPeers := allocsToPeerLayouts(network.hostName, allocs) + layout, err := network.garageAdminClient(t).GetLayout(h.ctx) + assert.NoError(t, err) + assert.ElementsMatch(t, expPeers, layout.Peers) + }) + + // TODO a host having allocs but removing all of them } diff --git a/go/daemon/network/network_it_util_test.go b/go/daemon/network/network_it_util_test.go index e7c724b..a540ba8 100644 --- a/go/daemon/network/network_it_util_test.go +++ b/go/daemon/network/network_it_util_test.go @@ -6,6 +6,7 @@ import ( "fmt" "isle/bootstrap" "isle/daemon/daecommon" + "isle/garage" "isle/nebula" "isle/toolkit" "os" @@ -167,8 +168,9 @@ func (h *integrationHarness) mkNetworkConfig( } type createNetworkOpts struct { - creationParams bootstrap.CreationParams - manualShutdown bool + creationParams bootstrap.CreationParams + manualShutdown bool + numStorageAllocs int } func (o *createNetworkOpts) withDefaults() *createNetworkOpts { @@ -180,15 +182,19 @@ func (o *createNetworkOpts) withDefaults() *createNetworkOpts { o.creationParams = bootstrap.NewCreationParams("test", "test.localnet") } + if o.numStorageAllocs == 0 { + o.numStorageAllocs = 3 + } + return o } type integrationHarnessNetwork struct { Network + ctx context.Context + logger *mlog.Logger hostName nebula.HostName - creationParams bootstrap.CreationParams - networkConfig daecommon.NetworkConfig stateDir, runtimeDir toolkit.Dir opts *Opts } @@ -202,9 +208,10 @@ func (h *integrationHarness) createNetwork( opts = opts.withDefaults() var ( + logger = h.logger.WithNamespace("network").WithNamespace(hostNameStr) networkConfig = h.mkNetworkConfig(t, &networkConfigOpts{ hasPublicAddr: true, - numStorageAllocs: 3, + numStorageAllocs: opts.numStorageAllocs, }) stateDir = h.mkDir(t, "state") @@ -220,7 +227,7 @@ func (h *integrationHarness) createNetwork( network, err := Create( h.ctx, - h.logger.WithNamespace("network").WithNamespace(hostNameStr), + logger, networkConfig, getEnvBinDirPath(), stateDir, @@ -245,9 +252,9 @@ func (h *integrationHarness) createNetwork( return integrationHarnessNetwork{ network, + h.ctx, + logger, hostName, - opts.creationParams, - networkConfig, stateDir, runtimeDir, networkOpts, @@ -288,6 +295,7 @@ func (h *integrationHarness) joinNetwork( } var ( + logger = h.logger.WithNamespace("network").WithNamespace(hostNameStr) networkConfig = h.mkNetworkConfig(t, opts.networkConfigOpts) stateDir = h.mkDir(t, "state") runtimeDir = h.mkDir(t, "runtime") @@ -299,7 +307,7 @@ func (h *integrationHarness) joinNetwork( t.Logf("Joining as %q", hostNameStr) joinedNetwork, err := Join( h.ctx, - h.logger.WithNamespace("network").WithNamespace(hostNameStr), + logger, networkConfig, joiningBootstrap, getEnvBinDirPath(), @@ -322,11 +330,52 @@ func (h *integrationHarness) joinNetwork( return integrationHarnessNetwork{ joinedNetwork, + h.ctx, + logger, hostName, - network.creationParams, - networkConfig, stateDir, runtimeDir, networkOpts, } } + +func (nh *integrationHarnessNetwork) getConfig(t *testing.T) daecommon.NetworkConfig { + networkConfig, err := nh.Network.GetConfig(nh.ctx) + require.NoError(t, err) + return networkConfig +} + +func (nh *integrationHarnessNetwork) getBootstrap( + t *testing.T, +) bootstrap.Bootstrap { + currBootstrap, err := nh.Network.(*network).getBootstrap() + require.NoError(t, err) + return currBootstrap +} + +func (nh *integrationHarnessNetwork) garageAdminClient( + t *testing.T, +) *garage.AdminClient { + c := newGarageAdminClient( + nh.logger, + nh.getConfig(t), + nh.opts.GarageAdminToken, + nh.getBootstrap(t).ThisHost(), + ) + t.Cleanup(func() { assert.NoError(t, c.Close()) }) + return c +} + +func (nh *integrationHarnessNetwork) getHostsByName( + t *testing.T, +) map[nebula.HostName]bootstrap.Host { + hosts, err := nh.Network.GetHosts(nh.ctx) + require.NoError(t, err) + + hostsByName := map[nebula.HostName]bootstrap.Host{} + for _, h := range hosts { + hostsByName[h.Name] = h + } + + return hostsByName +} diff --git a/go/garage/admin_client.go b/go/garage/admin_client.go index 16ea4a2..148a8b2 100644 --- a/go/garage/admin_client.go +++ b/go/garage/admin_client.go @@ -293,10 +293,23 @@ func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) { } // ApplyLayout modifies the layout of the garage cluster. Only layout of the -// given peers will be modified/created, other peers are not affected. +// given peers will be modified/created/removed, other peers are not affected. func (c *AdminClient) ApplyLayout( - ctx context.Context, peers []PeerLayout, + ctx context.Context, addModifyPeers []PeerLayout, removePeerIDs []string, ) error { + type removePeer struct { + ID string `json:"id"` + Remove bool `json:"remove"` + } + + peers := make([]any, 0, len(addModifyPeers)+len(removePeerIDs)) + for _, p := range addModifyPeers { + peers = append(peers, p) + } + for _, id := range removePeerIDs { + peers = append(peers, removePeer{ID: id, Remove: true}) + } + { // https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout err := c.do(ctx, nil, "POST", "/v1/layout", peers)