Get rid of orphan removal after investigation into race conditions, new solution pending

Brian Picciano 2025-01-02 14:08:24 +01:00
parent e3d4fc5a8e
commit 9e508ef4e2
10 changed files with 358 additions and 176 deletions

View File

@@ -13,16 +13,13 @@ import (
"isle/secrets"
"isle/toolkit"
"net"
"net/netip"
"path/filepath"
"slices"
"strconv"
"time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
"github.com/minio/minio-go/v7"
"golang.org/x/exp/maps"
)
// Paths within garage's global bucket.
@@ -218,6 +215,7 @@ func getGarageBootstrapHosts(
host, err := authedHost.Unwrap(currBootstrap.CAPublicCredentials)
if err != nil {
logger.Warn(ctx, "Host could not be authenticated", err)
continue
}
hosts[host.Name] = host
@@ -363,59 +361,3 @@ func garageWaitForAlloc(
return nil
}
// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
// the given garage cluster status. It will return zero values if the host has
// no buddy.
//
// For situations where we want one host to affect the cluster layout of another
// host's peers, we use a simple system to determine a single host which is
// responsible. The goal is not to be 100% race-proof (garage handles that), but
// rather to try to prevent all hosts from modifying the same host's layout at
// the same time.
//
// The system is to order all hosts by their IP, and say that each host is
// responsible for (aka the "buddy" of) the host immediately after their own in
// that list. The last host in that list is responsible for the first.
func garageNodeBuddyPeers(
status garage.ClusterStatus, host bootstrap.Host,
) (
netip.Addr, []garage.Role,
) {
var (
thisIP = host.IP()
nodeRolesByIP = map[netip.Addr][]garage.Role{}
)
for _, node := range status.Nodes {
if node.Role == nil {
continue
}
ip := node.Addr.Addr()
nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
}
// If there is only a single host in the cluster (or, somehow, none) then
// that host has no buddy.
if len(nodeRolesByIP) < 2 {
return netip.Addr{}, nil
}
nodeIPs := maps.Keys(nodeRolesByIP)
slices.SortFunc(nodeIPs, netip.Addr.Compare)
for i, nodeIP := range nodeIPs {
var buddyIP netip.Addr
if i == len(nodeIPs)-1 {
buddyIP = nodeIPs[0]
} else if nodeIP == thisIP {
buddyIP = nodeIPs[i+1]
} else {
continue
}
return buddyIP, nodeRolesByIP[buddyIP]
}
panic("Unreachable")
}

View File

@@ -160,6 +160,9 @@ type Opts struct {
// used, either that which it was most recently initialized with or which
// was passed to [SetConfig].
Config *daecommon.NetworkConfig
// testBlocker is used by tests to set blockpoints.
testBlocker *toolkit.TestBlocker
}
func (o *Opts) withDefaults() *Opts {
@@ -508,10 +511,6 @@ func (n *network) initialize(
n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
n.periodically(
"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
)
return nil
}
@@ -603,77 +602,6 @@ func (n *network) reloadHosts(ctx context.Context) error {
return nil
}
// In general each host will manage the garage cluster layout of its own storage
// allocations via garageApplyLayout. There are three situations which are
// handled here, rather than garageApplyLayout:
//
// - A host removes all of its allocations via SetConfig.
// - A host removes all of its allocations by calling Load with no allocations
// in the provided daecommon.NetworkConfig.
// - A host is removed from the network by another host.
//
// In all of these cases the host no longer has any garage instances running,
// and so can't call garageApplyLayout on itself. To combat this we have all
// hosts which do have garage instances running periodically check that there
// are no garage nodes orphaned in the cluster layout, and remove them if so.
func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
n.l.RLock()
defer n.l.RUnlock()
thisHost := n.currBootstrap.ThisHost()
if len(thisHost.Garage.Instances) == 0 {
n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
return nil
}
adminClient := newGarageAdminClient(
n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
)
defer adminClient.Close()
clusterStatus, err := adminClient.Status(ctx)
if err != nil {
return fmt.Errorf("retrieving garage cluster status: %w", err)
}
buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
if len(buddyNodes) == 0 {
return nil
}
ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
for _, host := range n.currBootstrap.Hosts {
if host.IP() != buddyIP {
continue
} else if len(host.Garage.Instances) > 0 {
n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
return nil
}
break
}
// Either the host is no longer in the network, or it no longer has any
// garage instances set on it. Either way, remove its nodes from the cluster
// layout.
buddyNodeIDs := make([]string, len(buddyNodes))
for i, buddyNode := range buddyNodes {
buddyNodeIDs[i] = buddyNode.ID
}
n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
return fmt.Errorf(
"applying garage cluster layout, removing nodes %+v: %w",
buddyNodes,
err,
)
}
return nil
}
// returns the bootstrap prior to the reload being applied.
func (n *network) reload(
ctx context.Context,

View File

@@ -1,15 +1,18 @@
package network
import (
"fmt"
"isle/bootstrap"
"isle/daemon/daecommon"
"isle/garage"
"isle/garage/garagesrv"
"isle/jsonutil"
"isle/nebula"
"isle/toolkit"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -71,7 +74,7 @@ func TestJoin(t *testing.T) {
assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
})
t.Run("with alloc", func(t *testing.T) {
t.Run("with alloc/simple", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
@@ -86,6 +89,61 @@ func TestJoin(t *testing.T) {
assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
assertGarageLayout(t, map[*integrationHarnessNetwork]int{
primus: 3,
secondus: 1,
})
})
// Assert that if primus runs the orphan remover at the same moment that
// secondus is joining, the layout applied by secondus doesn't get
// overwritten.
t.Run("with alloc/remove orphans after garage layout applied", func(t *testing.T) {
t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
primusAdminClient = primus.garageAdminClient(t)
secondusBlocker = toolkit.NewTestBlocker(t)
)
secondusBlocker.ExpectBlockpoint("garageLayoutApplied").On(
t, h.ctx, func() {
h.logger.Info(h.ctx, "Waiting for new layout to propagate to primus")
err := toolkit.UntilTrue(
h.ctx, h.logger, 1*time.Second, func() (bool, error) {
layout, err := primusAdminClient.GetLayout(h.ctx)
if err != nil {
return false, fmt.Errorf("getting layout: %w", err)
}
return len(layout.Roles) == 4, nil
},
)
if !assert.NoError(t, err) {
return
}
//h.logger.Info(h.ctx, "Calling removeOrphanGarageNodes")
//assert.NoError(
// t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
//)
},
)
secondus := h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
blocker: secondusBlocker,
})
assertGarageLayout(t, map[*integrationHarnessNetwork]int{
primus: 3,
secondus: 1,
})
})
}
@@ -285,6 +343,8 @@ func TestNetwork_SetConfig(t *testing.T) {
})
t.Run("remove all storage allocs", func(t *testing.T) {
t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
@@ -319,10 +379,10 @@ func TestNetwork_SetConfig(t *testing.T) {
assert.NoError(t, err)
assert.Contains(t, layout.Roles, removedRole)
t.Log("Removing orphan garage nodes with primus")
assert.NoError(
t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
)
//t.Log("Removing orphan garage nodes with primus")
//assert.NoError(
//	t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
//)
t.Log("Checking that garage layout no longer contains the old allocation")
layout, err = primusGarageAdminClient.GetLayout(h.ctx)

View File

@@ -1,6 +1,7 @@
package network
import (
"cmp"
"context"
"fmt"
"isle/bootstrap"
@@ -11,6 +12,7 @@ import (
"isle/toolkit"
"os"
"path/filepath"
"slices"
"sync"
"sync/atomic"
"testing"
@@ -251,6 +253,7 @@ type joinNetworkOpts struct {
*networkConfigOpts
canCreateHosts bool
manualShutdown bool
blocker *toolkit.TestBlocker
}
func (o *joinNetworkOpts) withDefaults() *joinNetworkOpts {
@@ -288,6 +291,7 @@ func (h *integrationHarness) joinNetwork(
networkOpts = &Opts{
GarageAdminToken: "admin_token",
Config: &networkConfig,
testBlocker: opts.blocker,
}
)
@@ -382,3 +386,56 @@ func (nh *integrationHarnessNetwork) getHostsByName(
require.NoError(t, err)
return currBootstrap.Hosts
}
func assertGarageLayout(
t *testing.T,
wantLayout map[*integrationHarnessNetwork]int, // network -> num allocs
) {
wantLayoutSimple := map[string]int{}
for nh, wantAllocs := range wantLayout {
wantLayoutSimple[string(nh.hostName)] = wantAllocs
}
normalizeLayout := func(layout *garage.ClusterLayout) {
slices.SortFunc(layout.Roles, func(a, b garage.Role) int {
return cmp.Compare(a.ID, b.ID)
})
}
assertSingle := func(
nh *integrationHarnessNetwork, layout garage.ClusterLayout,
) {
gotLayoutSimple := map[string]int{}
for _, role := range layout.Roles {
gotLayoutSimple[role.Zone]++
}
assert.Equal(t, wantLayoutSimple, gotLayoutSimple, "layout from %q", nh.hostName)
}
var (
lastLayoutHostName nebula.HostName
lastLayout garage.ClusterLayout
)
for nh := range wantLayout {
layout, err := nh.garageAdminClient(t).GetLayout(nh.ctx)
assert.NoError(t, err)
normalizeLayout(&layout)
assertSingle(nh, layout)
if lastLayoutHostName != "" {
assert.Equal(
t,
lastLayout,
layout,
"layout of %q not equal to layout of %q",
lastLayoutHostName,
nh.hostName,
)
}
lastLayoutHostName = nh.hostName
lastLayout = layout
}
}

View File

@@ -0,0 +1,147 @@
package toolkit
import (
"context"
"sync"
"testing"
"github.com/stretchr/testify/assert"
)
// ExpectedBlockpoint represents the expectation that Blockpoint will be called
// on a TestBlocker. It is possible to both wait for the Blockpoint call to
// occur and to unblock it once it has occurred.
type ExpectedBlockpoint struct {
waitCh chan struct{}
unblockCh chan struct{}
}
// Wait will block until the blockpoint has been hit and is itself blocking, or will
// return the context error.
func (eb ExpectedBlockpoint) Wait(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-eb.waitCh:
return nil
}
}
// Unblock unblocks the Blockpoint call which is/was expected. Unblock can be
// called prior to Wait being called (and therefore prior to the Blockpoint
// being hit).
func (eb ExpectedBlockpoint) Unblock() {
close(eb.unblockCh)
}
// On is a helper which will spawn a go-routine, call Wait on the
// ExpectedBlockpoint, call the given callback, and then Unblock the
// ExpectedBlockpoint.
//
// If Wait returns an error (due to context cancellation) then this fails the
// test and returns without calling the callback.
func (eb ExpectedBlockpoint) On(t *testing.T, ctx context.Context, cb func()) {
go func() {
defer eb.Unblock()
if !assert.NoError(t, eb.Wait(ctx)) {
return
}
cb()
}()
}
// TestBlocker is used as an injected dependency into components, so that tests
// can cause those components to block at specific execution points internally.
// This is useful for testing race conditions between multiple components.
//
// A TestBlocker is initialized using `new`. A nil TestBlocker will never block.
type TestBlocker struct {
l sync.Mutex
blockpointsByID map[string][]ExpectedBlockpoint
blocksByID map[string]int
}
// NewTestBlocker initializes a TestBlocker and registers a Cleanup callback on
// the T which will call AssertExpectations.
func NewTestBlocker(t *testing.T) *TestBlocker {
b := new(TestBlocker)
t.Cleanup(func() { b.AssertExpectations(t) })
return b
}
// Blockpoint will block if and only if TestBlocker is non-nil and
// ExpectBlockpoint has been called with the same ID previously. If the context
// is canceled while blocking then this call will return.
func (b *TestBlocker) Blockpoint(ctx context.Context, id string) {
if b == nil {
return
}
b.l.Lock()
blockpoints := b.blockpointsByID[id]
if len(blockpoints) == 0 {
b.l.Unlock()
return
}
blockpoint, blockpoints := blockpoints[0], blockpoints[1:]
b.blockpointsByID[id] = blockpoints
b.blocksByID[id]++
b.l.Unlock()
close(blockpoint.waitCh)
select {
case <-ctx.Done():
case <-blockpoint.unblockCh:
}
}
// ExpectBlockpoint will cause the TestBlocker to block upon the next call to
// Blockpoint using the same id. The returned ExpectedBlockpoint can be used to
// wait until Blockpoint is called, as well as to unblock it.
func (b *TestBlocker) ExpectBlockpoint(id string) ExpectedBlockpoint {
b.l.Lock()
defer b.l.Unlock()
if b.blockpointsByID == nil {
b.blockpointsByID = map[string][]ExpectedBlockpoint{}
}
if b.blocksByID == nil {
b.blocksByID = map[string]int{}
}
blockpoint := ExpectedBlockpoint{
waitCh: make(chan struct{}),
unblockCh: make(chan struct{}),
}
b.blockpointsByID[id] = append(b.blockpointsByID[id], blockpoint)
return blockpoint
}
// AssertExpectations will Fail the test and return false if any calls to
// ExpectBlockpoint have not had a corresponding Blockpoint call made.
func (b *TestBlocker) AssertExpectations(t *testing.T) bool {
b.l.Lock()
defer b.l.Unlock()
var failed bool
for id, blockpoints := range b.blockpointsByID {
if len(blockpoints) == 0 {
continue
}
failed = true
t.Errorf(
"Blockpoint(%q) called %d times, expected %d more",
id, b.blocksByID[id], len(blockpoints),
)
}
return !failed
}
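A minimal usage sketch, not part of this commit: doWork and the "halfDone" ID below are hypothetical, though the TestBlocker methods are the ones defined above. A component takes an injected *TestBlocker and calls Blockpoint at the interesting execution point, while the test registers an expectation and reacts while the component is blocked there.

package example

import (
	"context"
	"testing"

	"isle/toolkit"
)

// doWork stands in for a component method with an internal blockpoint.
func doWork(ctx context.Context, blocker *toolkit.TestBlocker) {
	// ... first half of the work ...
	blocker.Blockpoint(ctx, "halfDone") // no-op if blocker is nil
	// ... second half of the work ...
}

func TestDoWork(t *testing.T) {
	var (
		ctx     = context.Background()
		blocker = toolkit.NewTestBlocker(t) // asserts expectations on test cleanup
	)

	// React while doWork is blocked at "halfDone", then unblock it.
	blocker.ExpectBlockpoint("halfDone").On(t, ctx, func() {
		// e.g. trigger the racing operation being tested here
	})

	doWork(ctx, blocker)
}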

View File

@@ -1,14 +0,0 @@
---
type: task
---
When removing a storage allocation the new layout should be applied _before_ the
impacted garage instance is shut down. Isle should ideally also wait for the
impacted instance to no longer be in the "draining" state prior to shutting it
down, if possible.
Some care needs to be taken in the case of the `daemon.yml` file being used for
configuration. The daemon should probably initially load using the old
configuration, and only then apply the new configuration as if it were applied
using `SetConfig`. This way the garage instance being removed can be brought
back up, drained, then shut down again.

View File

@@ -0,0 +1,84 @@
---
type: task
---
## Problem
There are high-level but extremely problematic issues with how garage layout
management is being done.
In general the strategy around layout management is that each host only modifies
the cluster layout related to itself, and never touches applied roles of other
hosts. This works great for all except one case: a host removing one or more of
its allocations.
There are two separate issues which must be dealt with, each related partially
to the other.
## Draining of garage data
When a garage node is removed from the cluster it first goes into the "draining"
state, so that other nodes in the cluster can ensure that the replication factor
for each piece of data is met prior to the node being decommissioned.
While the node is in draining state it cannot be used for S3 API calls, as the
bucket credentials are no longer present on it.
## Configuration change on restart
For hosts whose configuration is managed by `daemon.yml` it is not necessarily
known that a garage node used to exist at all upon restart. The host can't
investigate the cluster layout because it won't have a garage instance running,
and even if it could it wouldn't be able to bring up a garage node to properly
drain the old allocations.
## Invalid Solutions
One solution which is tempting but ultimately NOT viable is to make all hosts
run at least one garage instance, and if they have no storage allocations to
make that instance be a "gateway" instance. This won't work though, because
it would require all hosts to open up the RPC port on their firewall, and
firewall management requires extra user involvement.
Another previous solution was to use an "orphan remover" process on each host,
where the host would compare the garage cluster layout to the expected layout
based on the bootstrap data in the common bucket, and remove any hosts from the
layout which shouldn't be there and don't have a garage instance to remove
themselves with. This had a bunch of unresolvable race conditions, and it
didn't account for draining besides.
## Possible Solution
The solution seems to be that the host must maintain two views of its garage
allocations: the last known allocation state, and the desired allocation state.
The last known state needs to contain both what state the allocation was in
(healthy or draining) and its directories and capacity. This should get
updated anytime the host performs an action which changes it (modifying the
cluster layout to add a new instance or move an existing one to draining, or
actually removing an instance which is done draining).
The desired state is essentially the network configuration as it is now. This
will be used along with the last known state to take actions.
There are a few details to note with this solution:
- There will need to be a worker which periodically checks the last known state
for any nodes which were draining, and if they are done draining then remove
them.
- When the host starts up it should _always_ use the last known state, and only
once started up should it go to apply the desired configuration.
- When choosing an admin endpoint to use, the last known state should be used,
even though it might result in unexpected behavior from the user's perspective
(since the user only knows about the desired state). This applies to RPC
endpoints as well.
- The last/desired states need to be checked for conflicts, and an error emitted
in the event that there is one (either returned from SetConfig or Load). This
includes a new allocation using the same directory as an old one (based on RPC
port), or two allocations using the same RPC port.
- The nebula firewall must base its opened ports on the last known state rather
than desired state.
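As a rough illustration of the two views described above, a sketch of the state this might require (names and package placement are illustrative only and do not exist in the codebase; the desired state is essentially the existing daecommon.NetworkConfig):

package daecommon

// knownAllocState is the state an allocation was last known to be in within
// the garage cluster layout.
type knownAllocState string

const (
	knownAllocHealthy  knownAllocState = "healthy"
	knownAllocDraining knownAllocState = "draining"
)

// lastKnownAlloc records what this host believes it has already applied to the
// cluster layout for one of its allocations. It is updated every time the host
// changes the layout for that allocation: adding it, moving it to draining, or
// removing it once draining completes.
type lastKnownAlloc struct {
	RPCPort  int
	State    knownAllocState
	DataPath string
	MetaPath string
	Capacity int // same units as the capacity in the desired configuration
}

The draining worker, startup logic, SetConfig/Load conflict checks, firewall setup, and admin/RPC endpoint selection would all work by diffing a []lastKnownAlloc like this against the desired NetworkConfig, per the bullet points above.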

View File

@@ -1,8 +0,0 @@
---
type: task
---
There is a race condition in the garage orphan removal worker. Basically when
host A is adding its first allocation, it's possible that it hasn't yet updated
the host info in the common bucket, and so host B removes it from the layout
prior to that host info being updated.

View File

@@ -1,14 +0,0 @@
---
type: task
after:
- ./garage-apply-layout-before-stopping-instance.md
---
I think there is currently a bug related to re-adding a storage allocation on an
RPC port of a garage instance which was previously used:
- Step 1) Remove a storage allocation using RPC port N
- Step 2) Add a new allocation using RPC port N, but with a new data/meta dir
I believe in this case garage will go back to using the old data/meta dir, and
possibly even re-use the old pubkey.

View File

@@ -1,7 +1,7 @@
---
type: task
after:
- ./garage-apply-layout-before-stopping-instance.md
- ./garage-layout-management.md
---
When SetConfig is called, but ends up erroring, the new config should not end up