Get rid of orphan removal after investigation into race conditions, new solution pending

2025-01-02 14:08:24 +01:00 · 2025-01-02 14:08:24 +01:00 · 9e508ef4e2
commit 9e508ef4e2
parent e3d4fc5a8e
10 changed files with 358 additions and 176 deletions
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@ -13,16 +13,13 @@ import (
 	"isle/secrets"
 	"isle/toolkit"
 	"net"
-	"net/netip"
 	"path/filepath"
-	"slices"
 	"strconv"
 	"time"

 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
-	"golang.org/x/exp/maps"
 )

 // Paths within garage's global bucket.
@ -218,6 +215,7 @@ func getGarageBootstrapHosts(
 		host, err := authedHost.Unwrap(currBootstrap.CAPublicCredentials)
 		if err != nil {
 			logger.Warn(ctx, "Host could not be authenticated", err)
+			continue
 		}

 		hosts[host.Name] = host
@ -363,59 +361,3 @@ func garageWaitForAlloc(

 	return nil
 }
-
-// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
-// the given garage cluster status. It will return zero values if the host has
-// no buddy.
-//
-// For situations where we want one host to affect the cluster layout of another
-// host's peers, we use a simple system to determine a single host which is
-// responsible. The goal is not to be 100% race-proof (garage handles that), but
-// rather to try to prevent all hosts from modifying the same host's layout at
-// the same time.
-//
-// The system is to order all hosts by their IP, and say that each host is
-// responsible for (aka the "buddy" of) the host immediately after their own in
-// that list. The last host in that list is responsible for the first.
-func garageNodeBuddyPeers(
-	status garage.ClusterStatus, host bootstrap.Host,
-) (
-	netip.Addr, []garage.Role,
-) {
-	var (
-		thisIP        = host.IP()
-		nodeRolesByIP = map[netip.Addr][]garage.Role{}
-	)
-
-	for _, node := range status.Nodes {
-		if node.Role == nil {
-			continue
-		}
-
-		ip := node.Addr.Addr()
-		nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
-	}
-
-	// If there is only a single host in the cluster (or, somehow, none) then
-	// that host has no buddy.
-	if len(nodeRolesByIP) < 2 {
-		return netip.Addr{}, nil
-	}
-
-	nodeIPs := maps.Keys(nodeRolesByIP)
-	slices.SortFunc(nodeIPs, netip.Addr.Compare)
-
-	for i, nodeIP := range nodeIPs {
-		var buddyIP netip.Addr
-		if i == len(nodeIPs)-1 {
-			buddyIP = nodeIPs[0]
-		} else if nodeIP == thisIP {
-			buddyIP = nodeIPs[i+1]
-		} else {
-			continue
-		}
-		return buddyIP, nodeRolesByIP[buddyIP]
-	}
-
-	panic("Unreachable")
-}
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@ -160,6 +160,9 @@ type Opts struct {
 	// used, either that which it was most recently initialized with or which
 	// was passed to [SetConfig].
 	Config *daecommon.NetworkConfig
+
+	// testBlocker is used by tests to set blockpoints.
+	testBlocker *toolkit.TestBlocker
 }

 func (o *Opts) withDefaults() *Opts {
@ -508,10 +511,6 @@ func (n *network) initialize(

 	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)

-	n.periodically(
-		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
-	)
-
 	return nil
 }

@ -603,77 +602,6 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }

-// In general each host will manage the garage cluster layout of its own storage
-// allocations via garageApplyLayout. There are three situations which are
-// handled here, rather than garageApplyLayout:
-//
-//   - A host removes all of its allocations via SetConfig.
-//   - A host removes all of its allocations by calling Load with no allocations
-//     in the provided daecommon.NetworkConfig.
-//   - A host is removed from the network by another host.
-//
-// In all of these cases the host no longer has any garage instances running,
-// and so can't call garageApplyLayout on itself. To combat this we have all
-// hosts which do have garage instances running periodically check that there's
-// not some garage nodes orphaned in the cluster layout, and remove them if so.
-func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
-	n.l.RLock()
-	defer n.l.RUnlock()
-
-	thisHost := n.currBootstrap.ThisHost()
-	if len(thisHost.Garage.Instances) == 0 {
-		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
-		return nil
-	}
-
-	adminClient := newGarageAdminClient(
-		n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
-	)
-	defer adminClient.Close()
-
-	clusterStatus, err := adminClient.Status(ctx)
-	if err != nil {
-		return fmt.Errorf("retrieving garage cluster status: %w", err)
-	}
-
-	buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
-	if len(buddyNodes) == 0 {
-		return nil
-	}
-
-	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
-
-	for _, host := range n.currBootstrap.Hosts {
-		if host.IP() != buddyIP {
-			continue
-		} else if len(host.Garage.Instances) > 0 {
-			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
-			return nil
-		}
-		break
-	}
-
-	// Either the host is no longer in the network, or it no longer has any
-	// garage instances set on it. Either way, remove its nodes from the cluster
-	// layout.
-
-	buddyNodeIDs := make([]string, len(buddyNodes))
-	for i, buddyNode := range buddyNodes {
-		buddyNodeIDs[i] = buddyNode.ID
-	}
-
-	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
-	if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
-		return fmt.Errorf(
-			"applying garage cluster layout, removing nodes %+v: %w",
-			buddyNodes,
-			err,
-		)
-	}
-
-	return nil
-}
-
 // returns the bootstrap prior to the reload being applied.
 func (n *network) reload(
 	ctx context.Context,
--- a/go/daemon/network/network_it_test.go
+++ b/go/daemon/network/network_it_test.go
@ -1,15 +1,18 @@
 package network

 import (
+	"fmt"
 	"isle/bootstrap"
 	"isle/daemon/daecommon"
 	"isle/garage"
 	"isle/garage/garagesrv"
 	"isle/jsonutil"
 	"isle/nebula"
+	"isle/toolkit"
 	"os"
 	"path/filepath"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@ -71,7 +74,7 @@ func TestJoin(t *testing.T) {
 		assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
 	})

-	t.Run("with alloc", func(t *testing.T) {
+	t.Run("with alloc/simple", func(t *testing.T) {
 		var (
 			h        = newIntegrationHarness(t)
 			primus   = h.createNetwork(t, "primus", nil)
@ -86,6 +89,61 @@ func TestJoin(t *testing.T) {
 		assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))

 		assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
+		assertGarageLayout(t, map[*integrationHarnessNetwork]int{
+			primus:   3,
+			secondus: 1,
+		})
+	})
+
+	// Assert that if primus runs the orphan remover at the same moment that
+	// secondus is joining, the layout applied by secondus doesn't get
+	// overwritten.
+	t.Run("with alloc/remove orphans after garage layout applied", func(t *testing.T) {
+		t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
+
+		var (
+			h                 = newIntegrationHarness(t)
+			primus            = h.createNetwork(t, "primus", nil)
+			primusAdminClient = primus.garageAdminClient(t)
+			secondusBlocker   = toolkit.NewTestBlocker(t)
+		)
+
+		secondusBlocker.ExpectBlockpoint("garageLayoutApplied").On(
+			t, h.ctx, func() {
+				h.logger.Info(h.ctx, "Waiting for new layout to propagate to primus")
+				err := toolkit.UntilTrue(
+					h.ctx, h.logger, 1*time.Second, func() (bool, error) {
+						layout, err := primusAdminClient.GetLayout(h.ctx)
+						if err != nil {
+							return false, fmt.Errorf("getting layout: %w", err)
+						}
+
+						return len(layout.Roles) == 4, nil
+					},
+				)
+
+				if !assert.NoError(t, err) {
+					return
+				}
+
+				//h.logger.Info(h.ctx, "Calling removeOrphanGarageNodes")
+				//assert.NoError(
+				//	t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
+				//)
+			},
+		)
+
+		secondus := h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+			networkConfigOpts: &networkConfigOpts{
+				numStorageAllocs: 1,
+			},
+			blocker: secondusBlocker,
+		})
+
+		assertGarageLayout(t, map[*integrationHarnessNetwork]int{
+			primus:   3,
+			secondus: 1,
+		})
 	})
 }

@ -285,6 +343,8 @@ func TestNetwork_SetConfig(t *testing.T) {
 	})

 	t.Run("remove all storage allocs", func(t *testing.T) {
+		t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
+
 		var (
 			h        = newIntegrationHarness(t)
 			primus   = h.createNetwork(t, "primus", nil)
@ -319,10 +379,10 @@ func TestNetwork_SetConfig(t *testing.T) {
 		assert.NoError(t, err)
 		assert.Contains(t, layout.Roles, removedRole)

-		t.Log("Removing orphan garage nodes with primus")
-		assert.NoError(
-			t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
-		)
+		//t.Log("Removing orphan garage nodes with primus")
+		//assert.NoError(
+		//	t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
+		//)

 		t.Log("Checking that garage layout no longer contains the old allocation")
 		layout, err = primusGarageAdminClient.GetLayout(h.ctx)
--- a/go/daemon/network/network_it_util_test.go
+++ b/go/daemon/network/network_it_util_test.go
@ -1,6 +1,7 @@
 package network

 import (
+	"cmp"
 	"context"
 	"fmt"
 	"isle/bootstrap"
@ -11,6 +12,7 @@ import (
 	"isle/toolkit"
 	"os"
 	"path/filepath"
+	"slices"
 	"sync"
 	"sync/atomic"
 	"testing"
@ -251,6 +253,7 @@ type joinNetworkOpts struct {
 	*networkConfigOpts
 	canCreateHosts bool
 	manualShutdown bool
+	blocker        *toolkit.TestBlocker
 }

 func (o *joinNetworkOpts) withDefaults() *joinNetworkOpts {
@ -288,6 +291,7 @@ func (h *integrationHarness) joinNetwork(
 		networkOpts   = &Opts{
 			GarageAdminToken: "admin_token",
 			Config:           &networkConfig,
+			testBlocker:      opts.blocker,
 		}
 	)

@ -382,3 +386,56 @@ func (nh *integrationHarnessNetwork) getHostsByName(
 	require.NoError(t, err)
 	return currBootstrap.Hosts
 }
+
+func assertGarageLayout(
+	t *testing.T,
+	wantLayout map[*integrationHarnessNetwork]int, // network -> num allocs
+) {
+	wantLayoutSimple := map[string]int{}
+	for nh, wantAllocs := range wantLayout {
+		wantLayoutSimple[string(nh.hostName)] = wantAllocs
+	}
+
+	normalizeLayout := func(layout *garage.ClusterLayout) {
+		slices.SortFunc(layout.Roles, func(a, b garage.Role) int {
+			return cmp.Compare(a.ID, b.ID)
+		})
+	}
+
+	assertSingle := func(
+		nh *integrationHarnessNetwork, layout garage.ClusterLayout,
+	) {
+		gotLayoutSimple := map[string]int{}
+		for _, role := range layout.Roles {
+			gotLayoutSimple[role.Zone]++
+		}
+		assert.Equal(t, wantLayoutSimple, gotLayoutSimple, "layout from %q", nh.hostName)
+	}
+
+	var (
+		lastLayoutHostName nebula.HostName
+		lastLayout         garage.ClusterLayout
+	)
+
+	for nh := range wantLayout {
+		layout, err := nh.garageAdminClient(t).GetLayout(nh.ctx)
+		assert.NoError(t, err)
+
+		normalizeLayout(&layout)
+		assertSingle(nh, layout)
+
+		if lastLayoutHostName != "" {
+			assert.Equal(
+				t,
+				lastLayout,
+				layout,
+				"layout of %q not equal to layout of %q",
+				lastLayoutHostName,
+				nh.hostName,
+			)
+		}
+
+		lastLayoutHostName = nh.hostName
+		lastLayout = layout
+	}
+}
--- a/go/toolkit/testutils_blocker.go
+++ b/go/toolkit/testutils_blocker.go
@ -0,0 +1,147 @@
+package toolkit
+
+import (
+	"context"
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// ExpectedBlockpoint represents the expectation that Blockpoint will be called
+// on a TestBlocker. It is possible to both wait for the Blockpoint call to
+// occur and to unblock it once it has occurred.
+type ExpectedBlockpoint struct {
+	waitCh    chan struct{}
+	unblockCh chan struct{}
+}
+
+// Wait will block until blockpoint has been hit and is itself blocking, or will
+// return the context error.
+func (eb ExpectedBlockpoint) Wait(ctx context.Context) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-eb.waitCh:
+		return nil
+	}
+}
+
+// Unblock unblocks the Blockpoint call which is/was expected. Unblock can be
+// called prior to Wait being called (and therefore prior to the Blockpoint
+// being hit). It must be called at most once; a second call panics.
+func (eb ExpectedBlockpoint) Unblock() {
+	close(eb.unblockCh)
+}
+
+// On is a helper which will spawn a go-routine, call Wait on the
+// ExpectedBlockpoint, call the given callback, and then Unblock the
+// ExpectedBlockpoint.
+//
+// If Wait returns an error (due to context cancellation) then this fails the
+// test and returns without calling the callback.
+func (eb ExpectedBlockpoint) On(t *testing.T, ctx context.Context, cb func()) {
+	go func() {
+		defer eb.Unblock()
+		if !assert.NoError(t, eb.Wait(ctx)) {
+			return
+		}
+		cb()
+	}()
+}
+
+// TestBlocker is used as an injected dependency into components, so that tests
+// can cause those components to block at specific execution points internally.
+// This is useful for testing race conditions between multiple components.
+//
+// A TestBlocker is initialized using `new`. A nil TestBlocker will never block.
+type TestBlocker struct {
+	l               sync.Mutex
+	blockpointsByID map[string][]ExpectedBlockpoint
+	blocksByID      map[string]int
+}
+
+// NewTestBlocker initializes a TestBlocker and registers a Cleanup callback on
+// the T which will call AssertExpectations.
+func NewTestBlocker(t *testing.T) *TestBlocker {
+	b := new(TestBlocker)
+	t.Cleanup(func() { b.AssertExpectations(t) })
+	return b
+}
+
+// Blockpoint will block if and only if TestBlocker is non-nil and
+// ExpectBlockpoint has been called with the same ID previously. If the context
+// is canceled while blocking then this call will return.
+func (b *TestBlocker) Blockpoint(ctx context.Context, id string) {
+	if b == nil {
+		return
+	}
+
+	b.l.Lock()
+
+	blockpoints := b.blockpointsByID[id]
+	if len(blockpoints) == 0 {
+		b.l.Unlock()
+		return
+	}
+
+	blockpoint, blockpoints := blockpoints[0], blockpoints[1:]
+	b.blockpointsByID[id] = blockpoints
+	b.blocksByID[id]++
+
+	b.l.Unlock()
+
+	close(blockpoint.waitCh)
+
+	select {
+	case <-ctx.Done():
+	case <-blockpoint.unblockCh:
+	}
+}
+
+// ExpectBlockpoint will cause the TestBlocker to block upon the next call to
+// Blockpoint using the same id. The returned ExpectedBlockpoint can be used to
+// wait until Blockpoint is called, as well as to unblock it.
+func (b *TestBlocker) ExpectBlockpoint(id string) ExpectedBlockpoint {
+	b.l.Lock()
+	defer b.l.Unlock()
+
+	if b.blockpointsByID == nil {
+		b.blockpointsByID = map[string][]ExpectedBlockpoint{}
+	}
+
+	if b.blocksByID == nil {
+		b.blocksByID = map[string]int{}
+	}
+
+	blockpoint := ExpectedBlockpoint{
+		waitCh:    make(chan struct{}),
+		unblockCh: make(chan struct{}),
+	}
+
+	b.blockpointsByID[id] = append(b.blockpointsByID[id], blockpoint)
+
+	return blockpoint
+}
+
+// AssertExpectations will Fail the test and return false if any calls to
+// ExpectBlockpoint have not had a corresponding Blockpoint call made.
+func (b *TestBlocker) AssertExpectations(t *testing.T) bool {
+	b.l.Lock()
+	defer b.l.Unlock()
+
+	var failed bool
+	for id, blockpoints := range b.blockpointsByID {
+		if len(blockpoints) == 0 {
+			continue
+		}
+
+		failed = true
+		t.Errorf(
+			"Blockpoint(%q) called %d times, expected %d more",
+			id, b.blocksByID[id], len(blockpoints),
+		)
+	}
+
+	return !failed
+}
--- a/tasks/bugs/garage-apply-layout-before-stopping-instance.md
+++ b/tasks/bugs/garage-apply-layout-before-stopping-instance.md
@ -1,14 +0,0 @@
---
-type: task
---
-
-When removing a storage allocation the new layout should be applied _before_ the
-impacted garage instance is shut down. Isle should ideally also wait for the
-impacted instance to no longer be in the "draining" state prior to shutting it
-down, if possible.
-
-Some care needs to be taken in the case of the `daemon.yml` file being used for
-configuration. The daemon should probably initially load using the old
-configuration, and only then apply the new configuration as if it were applied
-using `SetConfig`. This way the garage instance being removed can be brought
-back up, drained, then shut down again.
--- a/tasks/bugs/garage-layout-management.md
+++ b/tasks/bugs/garage-layout-management.md
@ -0,0 +1,84 @@
+---
+type: task
+---
+
+## Problem
+
+There are high-level but extremely problematic issues with how garage layout
+management is being done.
+
+In general the strategy around layout management is that each host only modifies
+the cluster layout related to itself, and never touches applied roles of other
+hosts. This works great for all except one case: a host removing one or more of
+its allocations.
+
+There are two separate issues which must be dealt with, each related partially
+to the other.
+
+## Draining of garage data
+
+When a garage node is removed from the cluster it first goes into the "draining"
+state, so that other nodes in the cluster can ensure that the replication factor
+for each piece of data is met prior to the node being decommissioned.
+
+While the node is in draining state it cannot be used for S3 API calls, as the
+bucket credentials are no longer present on it.
+
+## Configuration change on restart
+
+For hosts whose configuration is managed by `daemon.yml` it is not necessarily
+known that a garage node used to exist at all upon restart. The host can't
+investigate the cluster layout because it won't have a garage instance running,
+and even if it could it wouldn't be able to bring up a garage node to properly
+drain the old allocations.
+
+# Invalid Solutions
+
+One solution which is tempting but ultimately NOT viable is to make all hosts
+run at least one garage instance, and if they have no storage allocations to
+make that instance be a "gateway" instance. This won't work though, because
+it would require all hosts to open up the RPC port on their firewall, and
+firewall management requires extra user involvement.
+
+Another previous solution was to use an "orphan remover" process on each host,
+where the host would compare the garage cluster layout to the expected layout
+based on the bootstrap data in the common bucket, and remove any hosts from the
+layout which shouldn't be there and don't have a garage instance to remove
+themselves with. This had a bunch of unresolvable race conditions, and it
+didn't account for draining besides.
+
+# Possible Solution
+
+The solution seems to be that the host must maintain two views of its garage
+allocations: the last known allocation state, and the desired allocation state.
+
+The last known state needs to contain both what state the allocation was in
+(healthy or draining), along with its directories and capacity. This should get
+updated anytime the host performs an action which changes it (modifying the
+cluster layout to add a new instance or move an existing one to draining, or
+actually removing an instance which is done draining).
+
+The desired state is essentially the network configuration as it is now. This
+will be used along with the last known state to take actions.
+
+There are a few details to note with this solution:
+
+- There will need to be a worker which periodically checks the last known state
+  for any nodes which were draining, and if they are done draining then remove
+  them.
+
+- When the host starts up it should _always_ use the last known state, and only
+  once started up should it go to apply the desired configuration.
+
+- When choosing an admin endpoint to use, the last known state should be used,
+  even though it might result in unexpected behavior from the user's perspective
+  (since the user only knows about the desired state). This applies for RPC
+  endpoints as well.
+
+- The last/desired states need to be checked for conflicts, and an error emitted
+  in the event that there is one (either returned from SetConfig or Load). This
+  includes a new allocation using the same directory as an old one (based on RPC
+  port), or two allocations using the same RPC port.
+
+- The nebula firewall must base its opened ports on the last known state rather
+  than desired state.
--- a/tasks/bugs/garage-orphan-remover-race-condition.md
+++ b/tasks/bugs/garage-orphan-remover-race-condition.md
@ -1,8 +0,0 @@
---
-type: task
---
-
-There is a race condition in the garage orphan removal worker. Basically when
-host A is adding its first allocation, it's possible that it hasn't yet updated
-the host info in the common bucket, and so host B removes it from the layout
-prior to that host info being updated.
--- a/tasks/bugs/garage-remove-then-re-add-alloc.md
+++ b/tasks/bugs/garage-remove-then-re-add-alloc.md
@ -1,14 +0,0 @@
---
-type: task
-after:
-  - ./garage-apply-layout-before-stopping-instance.md
---
-
-I think there is currently a bug related to re-adding a storage allocation on an
-RPC port of a garage instance which was previously used:
-
- Step 1) Remove a storage allocation using RPC port N
- Step 2) Add a new allocation using RPC port N, but with a new data/meta dir
-
-I believe in this case garage will go back to using the old data/meta dir, and
-possibly even re-use the old pubkey.
--- a/tasks/bugs/set-config-dont-commit-new-config-on-err.md
+++ b/tasks/bugs/set-config-dont-commit-new-config-on-err.md
@ -1,7 +1,7 @@
 ---
 type: task
 after:
-  - ./garage-apply-layout-before-stopping-instance.md
+  - ./garage-layout-management.md
 ---

 When SetConfig is called, but ends up erroring, the new config should not end up