From 9e508ef4e25da749c9e0db049258708a3b7fdeb7 Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Thu, 2 Jan 2025 14:08:24 +0100 Subject: [PATCH] Get rid of orphan removal after investigation into race conditions, new solution pending --- go/daemon/network/garage.go | 60 +------ go/daemon/network/network.go | 78 +--------- go/daemon/network/network_it_test.go | 70 ++++++++- go/daemon/network/network_it_util_test.go | 57 +++++++ go/toolkit/testutils_blocker.go | 147 ++++++++++++++++++ ...e-apply-layout-before-stopping-instance.md | 14 -- tasks/bugs/garage-layout-management.md | 84 ++++++++++ .../garage-orphan-remover-race-condition.md | 8 - tasks/bugs/garage-remove-then-re-add-alloc.md | 14 -- ...et-config-dont-commit-new-config-on-err.md | 2 +- 10 files changed, 358 insertions(+), 176 deletions(-) create mode 100644 go/toolkit/testutils_blocker.go delete mode 100644 tasks/bugs/garage-apply-layout-before-stopping-instance.md create mode 100644 tasks/bugs/garage-layout-management.md delete mode 100644 tasks/bugs/garage-orphan-remover-race-condition.md delete mode 100644 tasks/bugs/garage-remove-then-re-add-alloc.md diff --git a/go/daemon/network/garage.go b/go/daemon/network/garage.go index 0456764..ae51fa2 100644 --- a/go/daemon/network/garage.go +++ b/go/daemon/network/garage.go @@ -13,16 +13,13 @@ import ( "isle/secrets" "isle/toolkit" "net" - "net/netip" "path/filepath" - "slices" "strconv" "time" "dev.mediocregopher.com/mediocre-go-lib.git/mctx" "dev.mediocregopher.com/mediocre-go-lib.git/mlog" "github.com/minio/minio-go/v7" - "golang.org/x/exp/maps" ) // Paths within garage's global bucket. @@ -218,6 +215,7 @@ func getGarageBootstrapHosts( host, err := authedHost.Unwrap(currBootstrap.CAPublicCredentials) if err != nil { logger.Warn(ctx, "Host could not be authenticated", err) + continue } hosts[host.Name] = host @@ -363,59 +361,3 @@ func garageWaitForAlloc( return nil } - -// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on -// the given garage cluster status. It will return zero values if the host has -// no buddy. -// -// For situations where we want one host to affect the cluster layout of another -// host's peers, we use a simple system to determine a single host which is -// responsible. The goal is not to be 100% race-proof (garage handles that), but -// rather to try to prevent all hosts from modifying the same host's layout at -// the same time. -// -// The system is to order all hosts by their IP, and say that each host is -// responsible for (aka the "buddy" of) the host immediately after their own in -// that list. The last host in that list is responsible for the first. -func garageNodeBuddyPeers( - status garage.ClusterStatus, host bootstrap.Host, -) ( - netip.Addr, []garage.Role, -) { - var ( - thisIP = host.IP() - nodeRolesByIP = map[netip.Addr][]garage.Role{} - ) - - for _, node := range status.Nodes { - if node.Role == nil { - continue - } - - ip := node.Addr.Addr() - nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role) - } - - // If there is only a single host in the cluster (or, somehow, none) then - // that host has no buddy. 
- if len(nodeRolesByIP) < 2 { - return netip.Addr{}, nil - } - - nodeIPs := maps.Keys(nodeRolesByIP) - slices.SortFunc(nodeIPs, netip.Addr.Compare) - - for i, nodeIP := range nodeIPs { - var buddyIP netip.Addr - if i == len(nodeIPs)-1 { - buddyIP = nodeIPs[0] - } else if nodeIP == thisIP { - buddyIP = nodeIPs[i+1] - } else { - continue - } - return buddyIP, nodeRolesByIP[buddyIP] - } - - panic("Unreachable") -} diff --git a/go/daemon/network/network.go b/go/daemon/network/network.go index 3a84107..ec95c83 100644 --- a/go/daemon/network/network.go +++ b/go/daemon/network/network.go @@ -160,6 +160,9 @@ type Opts struct { // used, either that which it was most recently initialized with or which // was passed to [SetConfig]. Config *daecommon.NetworkConfig + + // testBlocker is used by tests to set blockpoints. + testBlocker *toolkit.TestBlocker } func (o *Opts) withDefaults() *Opts { @@ -508,10 +511,6 @@ func (n *network) initialize( n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute) - n.periodically( - "removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute, - ) - return nil } @@ -603,77 +602,6 @@ func (n *network) reloadHosts(ctx context.Context) error { return nil } -// In general each host will manage the garage cluster layout of its own storage -// allocations via garageApplyLayout. There are three situations which are -// handled here, rather than garageApplyLayout: -// -// - A host removes all of its allocations via SetConfig. -// - A host removes all of its allocations by calling Load with no allocations -// in the provided daecommon.NetworkConfig. -// - A host is removed from the network by another host. -// -// In all of these cases the host no longer has any garage instances running, -// and so can't call garageApplyLayout on itself. To combat this we have all -// hosts which do have garage instances running periodically check that there's -// not some garage nodes orphaned in the cluster layout, and remove them if so. -func (n *network) removeOrphanGarageNodes(ctx context.Context) error { - n.l.RLock() - defer n.l.RUnlock() - - thisHost := n.currBootstrap.ThisHost() - if len(thisHost.Garage.Instances) == 0 { - n.logger.Info(ctx, "No local garage instances, cannot remove orphans") - return nil - } - - adminClient := newGarageAdminClient( - n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost, - ) - defer adminClient.Close() - - clusterStatus, err := adminClient.Status(ctx) - if err != nil { - return fmt.Errorf("retrieving garage cluster status: %w", err) - } - - buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost) - if len(buddyNodes) == 0 { - return nil - } - - ctx = mctx.Annotate(ctx, "buddyIP", buddyIP) - - for _, host := range n.currBootstrap.Hosts { - if host.IP() != buddyIP { - continue - } else if len(host.Garage.Instances) > 0 { - n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing") - return nil - } - break - } - - // Either the host is no longer in the network, or it no longer has any - // garage instances set on it. Either way, remove its nodes from the cluster - // layout. 
- - buddyNodeIDs := make([]string, len(buddyNodes)) - for i, buddyNode := range buddyNodes { - buddyNodeIDs[i] = buddyNode.ID - } - - n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes") - if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil { - return fmt.Errorf( - "applying garage cluster layout, removing nodes %+v: %w", - buddyNodes, - err, - ) - } - - return nil -} - // returns the bootstrap prior to the reload being applied. func (n *network) reload( ctx context.Context, diff --git a/go/daemon/network/network_it_test.go b/go/daemon/network/network_it_test.go index 4a5a3d1..9d0bc0d 100644 --- a/go/daemon/network/network_it_test.go +++ b/go/daemon/network/network_it_test.go @@ -1,15 +1,18 @@ package network import ( + "fmt" "isle/bootstrap" "isle/daemon/daecommon" "isle/garage" "isle/garage/garagesrv" "isle/jsonutil" "isle/nebula" + "isle/toolkit" "os" "path/filepath" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -71,7 +74,7 @@ func TestJoin(t *testing.T) { assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t)) }) - t.Run("with alloc", func(t *testing.T) { + t.Run("with alloc/simple", func(t *testing.T) { var ( h = newIntegrationHarness(t) primus = h.createNetwork(t, "primus", nil) @@ -86,6 +89,61 @@ func TestJoin(t *testing.T) { assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx)) assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t)) + assertGarageLayout(t, map[*integrationHarnessNetwork]int{ + primus: 3, + secondus: 1, + }) + }) + + // Assert that if primus runs the orphan remover at the same moment that + // secondus is joining that the layout applied by secondus doesn't get + // overwritten. + t.Run("with alloc/remove orphans after garage layout applied", func(t *testing.T) { + t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly") + + var ( + h = newIntegrationHarness(t) + primus = h.createNetwork(t, "primus", nil) + primusAdminClient = primus.garageAdminClient(t) + secondusBlocker = toolkit.NewTestBlocker(t) + ) + + secondusBlocker.ExpectBlockpoint("garageLayoutApplied").On( + t, h.ctx, func() { + h.logger.Info(h.ctx, "Waiting for new layout to propagate to primus") + err := toolkit.UntilTrue( + h.ctx, h.logger, 1*time.Second, func() (bool, error) { + layout, err := primusAdminClient.GetLayout(h.ctx) + if err != nil { + return false, fmt.Errorf("getting layout: %w", err) + } + + return len(layout.Roles) == 4, nil + }, + ) + + if !assert.NoError(t, err) { + return + } + + //h.logger.Info(h.ctx, "Calling removeOrphanGarageNodes") + //assert.NoError( + // t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx), + //) + }, + ) + + secondus := h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{ + networkConfigOpts: &networkConfigOpts{ + numStorageAllocs: 1, + }, + blocker: secondusBlocker, + }) + + assertGarageLayout(t, map[*integrationHarnessNetwork]int{ + primus: 3, + secondus: 1, + }) }) } @@ -285,6 +343,8 @@ func TestNetwork_SetConfig(t *testing.T) { }) t.Run("remove all storage allocs", func(t *testing.T) { + t.Skip("This is currently expected to fail. 
Orphan removal is going to be reworked accordingly") + var ( h = newIntegrationHarness(t) primus = h.createNetwork(t, "primus", nil) @@ -319,10 +379,10 @@ func TestNetwork_SetConfig(t *testing.T) { assert.NoError(t, err) assert.Contains(t, layout.Roles, removedRole) - t.Log("Removing orphan garage nodes with primus") - assert.NoError( - t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx), - ) + //t.Log("Removing orphan garage nodes with primus") + //assert.NoError( + // t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx), + //) t.Log("Checking that garage layout no longer contains the old allocation") layout, err = primusGarageAdminClient.GetLayout(h.ctx) diff --git a/go/daemon/network/network_it_util_test.go b/go/daemon/network/network_it_util_test.go index 45a2f2d..8f3222f 100644 --- a/go/daemon/network/network_it_util_test.go +++ b/go/daemon/network/network_it_util_test.go @@ -1,6 +1,7 @@ package network import ( + "cmp" "context" "fmt" "isle/bootstrap" @@ -11,6 +12,7 @@ import ( "isle/toolkit" "os" "path/filepath" + "slices" "sync" "sync/atomic" "testing" @@ -251,6 +253,7 @@ type joinNetworkOpts struct { *networkConfigOpts canCreateHosts bool manualShutdown bool + blocker *toolkit.TestBlocker } func (o *joinNetworkOpts) withDefaults() *joinNetworkOpts { @@ -288,6 +291,7 @@ func (h *integrationHarness) joinNetwork( networkOpts = &Opts{ GarageAdminToken: "admin_token", Config: &networkConfig, + testBlocker: opts.blocker, } ) @@ -382,3 +386,56 @@ func (nh *integrationHarnessNetwork) getHostsByName( require.NoError(t, err) return currBootstrap.Hosts } + +func assertGarageLayout( + t *testing.T, + wantLayout map[*integrationHarnessNetwork]int, // network -> num allocs +) { + wantLayoutSimple := map[string]int{} + for nh, wantAllocs := range wantLayout { + wantLayoutSimple[string(nh.hostName)] = wantAllocs + } + + normalizeLayout := func(layout *garage.ClusterLayout) { + slices.SortFunc(layout.Roles, func(a, b garage.Role) int { + return cmp.Compare(a.ID, b.ID) + }) + } + + assertSingle := func( + nh *integrationHarnessNetwork, layout garage.ClusterLayout, + ) { + gotLayoutSimple := map[string]int{} + for _, role := range layout.Roles { + gotLayoutSimple[role.Zone]++ + } + assert.Equal(t, wantLayoutSimple, gotLayoutSimple, "layout from %q", nh.hostName) + } + + var ( + lastLayoutHostName nebula.HostName + lastLayout garage.ClusterLayout + ) + + for nh := range wantLayout { + layout, err := nh.garageAdminClient(t).GetLayout(nh.ctx) + assert.NoError(t, err) + + normalizeLayout(&layout) + assertSingle(nh, layout) + + if lastLayoutHostName != "" { + assert.Equal( + t, + lastLayout, + layout, + "layout of %q not equal to layout of %q", + lastLayoutHostName, + nh.hostName, + ) + } + + lastLayoutHostName = nh.hostName + lastLayout = layout + } +} diff --git a/go/toolkit/testutils_blocker.go b/go/toolkit/testutils_blocker.go new file mode 100644 index 0000000..358cb03 --- /dev/null +++ b/go/toolkit/testutils_blocker.go @@ -0,0 +1,147 @@ +package toolkit + +import ( + "context" + "sync" + "testing" + + "github.com/stretchr/testify/assert" +) + +// ExpectedBlockpoint represents the expectation that Blockpoint will be called +// on a TestBlocker. It is possible to both wait for the Blockpoint call to +// occur and to unblock it once it has occured. +type ExpectedBlockpoint struct { + waitCh chan struct{} + unblockCh chan struct{} +} + +// Wait will block until blockpoint has been hit and is itself blocking, or will +// return the context error. 
+func (eb ExpectedBlockpoint) Wait(ctx context.Context) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-eb.waitCh:
+		return nil
+	}
+}
+
+// Unblock unblocks the Blockpoint call which is/was expected. Unblock can be
+// called prior to Wait being called (and therefore prior to the Blockpoint
+// being hit).
+func (eb ExpectedBlockpoint) Unblock() {
+	close(eb.unblockCh)
+}
+
+// On is a helper which will spawn a go-routine, call Wait on the
+// ExpectedBlockpoint, call the given callback, and then Unblock the
+// ExpectedBlockpoint.
+//
+// If Wait returns an error (due to context cancellation) then this fails the
+// test and returns without calling the callback.
+func (eb ExpectedBlockpoint) On(t *testing.T, ctx context.Context, cb func()) {
+	go func() {
+		defer eb.Unblock()
+		if !assert.NoError(t, eb.Wait(ctx)) {
+			return
+		}
+		cb()
+	}()
+}
+
+// TestBlocker is used as an injected dependency into components, so that tests
+// can cause those components to block at specific execution points internally.
+// This is useful for testing race conditions between multiple components.
+//
+// A TestBlocker is initialized using `new`. A nil TestBlocker will never block.
+type TestBlocker struct {
+	l               sync.Mutex
+	blockpointsByID map[string][]ExpectedBlockpoint
+	blocksByID      map[string]int
+}
+
+// NewTestBlocker initializes a TestBlocker and registers a Cleanup callback on
+// the T which will call AssertExpectations.
+func NewTestBlocker(t *testing.T) *TestBlocker {
+	b := new(TestBlocker)
+	t.Cleanup(func() { b.AssertExpectations(t) })
+	return b
+}
+
+// Blockpoint will block if and only if TestBlocker is non-nil and
+// ExpectBlockpoint has been called with the same ID previously. If the context
+// is canceled while blocking then this call will return.
+func (b *TestBlocker) Blockpoint(ctx context.Context, id string) {
+	if b == nil {
+		return
+	}
+
+	b.l.Lock()
+
+	blockpoints := b.blockpointsByID[id]
+	if len(blockpoints) == 0 {
+		b.l.Unlock()
+		return
+	}
+
+	blockpoint, blockpoints := blockpoints[0], blockpoints[1:]
+	b.blockpointsByID[id] = blockpoints
+	b.blocksByID[id]++
+
+	b.l.Unlock()
+
+	close(blockpoint.waitCh)
+
+	select {
+	case <-ctx.Done():
+	case <-blockpoint.unblockCh:
+	}
+}
+
+// ExpectBlockpoint will cause the TestBlocker to block upon the next call to
+// Blockpoint using the same id. The returned ExpectedBlockpoint can be used to
+// wait until Blockpoint is called, as well as to unblock it.
+func (b *TestBlocker) ExpectBlockpoint(id string) ExpectedBlockpoint {
+	b.l.Lock()
+	defer b.l.Unlock()
+
+	if b.blockpointsByID == nil {
+		b.blockpointsByID = map[string][]ExpectedBlockpoint{}
+	}
+
+	if b.blocksByID == nil {
+		b.blocksByID = map[string]int{}
+	}
+
+	blockpoint := ExpectedBlockpoint{
+		waitCh:    make(chan struct{}),
+		unblockCh: make(chan struct{}),
+	}
+
+	b.blockpointsByID[id] = append(b.blockpointsByID[id], blockpoint)
+
+	return blockpoint
+}
+
+// AssertExpectations will Fail the test and return false if any calls to
+// ExpectBlockpoint have not had a corresponding Blockpoint call made.
+func (b *TestBlocker) AssertExpectations(t *testing.T) bool {
+	b.l.Lock()
+	defer b.l.Unlock()
+
+	var failed bool
+	for id, blockpoints := range b.blockpointsByID {
+		if len(blockpoints) == 0 {
+			continue
+		}
+
+		failed = true
+		t.Errorf(
+			"Blockpoint(%q) called %d times, expected %d more",
+			id, b.blocksByID[id], len(blockpoints),
+		)
+	}
+
+	return !failed
+}
diff --git a/tasks/bugs/garage-apply-layout-before-stopping-instance.md b/tasks/bugs/garage-apply-layout-before-stopping-instance.md
deleted file mode 100644
index dc4fb34..0000000
--- a/tasks/bugs/garage-apply-layout-before-stopping-instance.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-type: task
----
-
-When removing a storage allocation the new layout should be applied _before_ the
-impacted garage instance is shut down. Isle should ideally also wait for the
-impacted instance to no longer be in the "draining" state prior to shutting it
-down, if possible.
-
-Some care needs to be taken in the case of the `daemon.yml` file being used for
-configuration. The daemon should probably initially load using the old
-configuration, and only then apply the new configuration as if it were applied
-using `SetConfig`. This way the garage instance being removed can be brought
-back up, drained, then shut down again.
diff --git a/tasks/bugs/garage-layout-management.md b/tasks/bugs/garage-layout-management.md
new file mode 100644
index 0000000..bf4d500
--- /dev/null
+++ b/tasks/bugs/garage-layout-management.md
@@ -0,0 +1,84 @@
+---
+type: task
+---
+
+## Problem
+
+There are high-level but extremely problematic issues with how garage layout
+management is being done.
+
+In general the strategy around layout management is that each host only modifies
+the cluster layout related to itself, and never touches applied roles of other
+hosts. This works great for all except one case: a host removing one or more of
+its allocations.
+
+There are two separate issues which must be dealt with, each related partially
+to the other.
+
+## Draining of garage data
+
+When a garage node is removed from the cluster it first goes into the "draining"
+state, so that other nodes in the cluster can ensure that the replication factor
+for each piece of data is met prior to the node being decommissioned.
+
+While the node is in draining state it cannot be used for S3 API calls, as the
+bucket credentials are no longer present on it.
+
+## Configuration change on restart
+
+For hosts whose configuration is managed by `daemon.yml` it is not necessarily
+known that a garage node used to exist at all upon restart. The host can't
+investigate the cluster layout because it won't have a garage instance running,
+and even if it could it wouldn't be able to bring up a garage node to properly
+drain the old allocations.
+
+# Invalid Solutions
+
+One solution which is tempting but ultimately NOT viable is to make all hosts
+run at least one garage instance, and if they have no storage allocations to
+make that instance be a "gateway" instance. This won't work though, because
+it would require all hosts to open up the RPC port on their firewall, and
+firewall management requires extra user involvement.
+
+Another previous solution was to use an "orphan remover" process on each host,
+where the host would compare the garage cluster layout to the expected layout
+based on the bootstrap data in the common bucket, and remove any hosts from the
+layout which shouldn't be there and don't have a garage instance to remove
+themselves with. This had a bunch of unresolvable race conditions, and it
+didn't account for draining besides.
+
+# Possible Solution
+
+The solution seems to be that the host must maintain two views of its garage
+allocations: the last known allocation state, and the desired allocation state.
+
+The last known state needs to contain what state the allocation was in
+(healthy or draining), along with its directories and capacity. This should get
+updated anytime the host performs an action which changes it (modifying the
+cluster layout to add a new instance or move an existing one to draining, or
+actually removing an instance which is done draining).
+
+The desired state is essentially the network configuration as it is now. This
+will be used along with the last known state to take actions.
+
+There are a few details to note with this solution:
+
+- There will need to be a worker which periodically checks the last known state
+  for any nodes which were draining, and if they are done draining then remove
+  them.
+
+- When the host starts up it should _always_ use the last known state, and only
+  once started up should it go to apply the desired configuration.
+
+- When choosing an admin endpoint to use, the last known state should be used,
+  even though it might result in unexpected behavior from the user's perspective
+  (since the user only knows about the desired state). This applies to RPC
+  endpoints as well.
+
+- The last/desired states need to be checked for conflicts, and an error emitted
+  in the event that there is one (either returned from SetConfig or Load). This
+  includes a new allocation using the same directory as an old one (based on RPC
+  port), or two allocations using the same RPC port.
+
+- The nebula firewall must base its opened ports on the last known state rather
+  than desired state.
diff --git a/tasks/bugs/garage-orphan-remover-race-condition.md b/tasks/bugs/garage-orphan-remover-race-condition.md
deleted file mode 100644
index 7322dc2..0000000
--- a/tasks/bugs/garage-orphan-remover-race-condition.md
+++ /dev/null
@@ -1,8 +0,0 @@
----
-type: task
----
-
-There is a race condition in the garage orphan removal worker. Basically when
-host A is adding its first allocation, it's possible that it hasn't yet updated
-the host info in the common bucket, and so host B removes it from the layout
-prior to that host info being updated.
diff --git a/tasks/bugs/garage-remove-then-re-add-alloc.md b/tasks/bugs/garage-remove-then-re-add-alloc.md
deleted file mode 100644
index 9fca654..0000000
--- a/tasks/bugs/garage-remove-then-re-add-alloc.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-type: task
-after:
-  - ./garage-apply-layout-before-stopping-instance.md
----
-
-I think there is currently a bug related to re-adding a storage allocation on an
-RPC port of a garage instance which was previously used:
-
-- Step 1) Remove a storage allocation using RPC port N
-- Step 2) Add a new allocation using RPC port N, but with a new data/meta dir
-
-I believe in this case garage will go back to using the old data/meta dir, and
-possibly even re-use the old pubkey.
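As a rough illustration of the "Possible Solution" described in
garage-layout-management.md above, the two views of allocation state and the
conflict checks might be modeled along these lines. This is only a sketch:
garageAllocState, garageAllocStates, validate, and the exact conflict rules are
hypothetical and are not defined anywhere in this patch.

package network // hypothetical placement; nothing below exists in this patch

import "fmt"

// garageAllocState is a sketch of the "last known" record for a single
// allocation: enough to pick admin/RPC endpoints, open firewall ports, and
// decide when a draining node can finally be removed from the layout.
type garageAllocState struct {
	RPCPort  int
	MetaPath string
	DataPath string
	Capacity int
	Draining bool // set once the node has been moved to the draining state
}

// garageAllocStates pairs the two views described above: the state last
// applied to the cluster layout, and the state the current network
// configuration asks for.
type garageAllocStates struct {
	LastKnown []garageAllocState // persisted locally, always loaded first on startup
	Desired   []garageAllocState // derived from daecommon.NetworkConfig
}

// validate sketches the conflict checks mentioned in the bullet points: no two
// desired allocations may share an RPC port, and a desired allocation may not
// reuse the data directory of a last-known allocation on a different RPC port.
func (s garageAllocStates) validate() error {
	desiredPorts := map[int]bool{}
	for _, want := range s.Desired {
		if desiredPorts[want.RPCPort] {
			return fmt.Errorf("two allocations share RPC port %d", want.RPCPort)
		}
		desiredPorts[want.RPCPort] = true

		for _, last := range s.LastKnown {
			if last.DataPath == want.DataPath && last.RPCPort != want.RPCPort {
				return fmt.Errorf(
					"data dir %q already used by the allocation on RPC port %d",
					want.DataPath, last.RPCPort,
				)
			}
		}
	}
	return nil
}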
diff --git a/tasks/bugs/set-config-dont-commit-new-config-on-err.md b/tasks/bugs/set-config-dont-commit-new-config-on-err.md index d07d587..b3c5839 100644 --- a/tasks/bugs/set-config-dont-commit-new-config-on-err.md +++ b/tasks/bugs/set-config-dont-commit-new-config-on-err.md @@ -1,7 +1,7 @@ --- type: task after: - - ./garage-apply-layout-before-stopping-instance.md + - ./garage-layout-management.md --- When SetConfig is called, but ends up erroring, the new config should not end up
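For context on the TestBlocker introduced in go/toolkit/testutils_blocker.go,
here is a minimal sketch of the intended wiring: a component calls Blockpoint
at an interesting execution point, and a test arms that blockpoint so it can
interleave a racing operation while the component is paused. The worker type,
its doWork method, and the "workDone" blockpoint ID are hypothetical; only the
toolkit API itself comes from this patch.

package example // hypothetical; only the toolkit API below comes from this patch

import (
	"context"
	"testing"

	"isle/toolkit"
)

// worker is a hypothetical component holding an optional *TestBlocker. A nil
// blocker makes every Blockpoint call a no-op, so production code paths are
// unaffected.
type worker struct {
	blocker *toolkit.TestBlocker
}

// doWork pauses at the hypothetical "workDone" blockpoint once its work is
// finished, giving a test the chance to run something before doWork returns.
func (w *worker) doWork(ctx context.Context) {
	// ... the work being raced against would happen here ...
	w.blocker.Blockpoint(ctx, "workDone")
}

func TestWorkerRace(t *testing.T) {
	ctx := context.Background()

	// NewTestBlocker registers AssertExpectations via t.Cleanup, so an armed
	// blockpoint which is never hit fails the test.
	blocker := toolkit.NewTestBlocker(t)

	// Arm the blockpoint: once doWork hits it, the callback runs in a separate
	// go-routine, and doWork is unblocked when the callback returns.
	blocker.ExpectBlockpoint("workDone").On(t, ctx, func() {
		// The operation racing against doWork would run here, while doWork is
		// still paused at its blockpoint.
	})

	w := &worker{blocker: blocker}
	w.doWork(ctx)
}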