WIP

2024-11-05 22:31:57 +01:00
5 changed files with 272 additions and 48 deletions
--- a/go/daemon/children/children.go
+++ b/go/daemon/children/children.go
@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
 	return nil
 }

-// TODO this doesn't handle removing garage nodes
 func (c *Children) reloadGarage(
 	ctx context.Context,
 	networkConfig daecommon.NetworkConfig,
@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
 			)
 		)

+		// TODO it's possible that the config changed, but only the bootstrap
+		// peers, in which case we don't need to restart the node.
 		childConfigPath, changed, err := garageWriteChildConfig(
 			ctx,
 			c.logger,
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@ -12,13 +12,16 @@ import (
 	"isle/nebula"
 	"isle/secrets"
 	"net"
+	"net/netip"
 	"path/filepath"
+	"slices"
 	"strconv"
 	"time"

 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
+	"golang.org/x/exp/maps"
 )

 // Paths within garage's global bucket.
@ -119,6 +122,8 @@ func garageApplyLayout(
 		}
 	}

+	_, _ = adminClient.Status(ctx) // TODO remove this
+
 	return adminClient.ApplyLayout(ctx, peers, idsToRemove)
 }

@ -341,3 +346,67 @@ func garageWaitForAlloc(
 		return nil
 	}
 }
+
+// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
+// the given garage cluster status. It will return zero values if the host has
+// no buddy.
+//
+// For situations where we want one host to affect the cluster layout of another
+// host's peers, we use a simple system to determine a single host which is
+// responsible. The goal is not to be 100% race-proof (garage handles that), but
+// rather to try to prevent all hosts from modifying the same host's layout at
+// the same time.
+//
+// The system is to order all hosts by their IP, and say that each host is
+// responsible for (aka the "buddy" of) the host immediately after their own in
+// that list. The last host in that list is responsible for the first.
+func garageNodeBuddyPeers(
+	status garage.ClusterStatus, host bootstrap.Host,
+) (
+	netip.Addr, []garage.PeerLayout,
+) {
+	var (
+		thisIP    = host.IP()
+		peersByID = make(
+			map[string]garage.PeerLayout, len(status.Layout.Peers),
+		)
+		nodePeersByIP = map[netip.Addr][]garage.PeerLayout{}
+	)
+
+	for _, peer := range status.Layout.Peers {
+		peersByID[peer.ID] = peer
+	}
+
+	for _, node := range status.Nodes {
+		peer, ok := peersByID[node.ID]
+		if !ok {
+			continue
+		}
+
+		ip := node.Addr.Addr()
+		nodePeersByIP[ip] = append(nodePeersByIP[ip], peer)
+	}
+
+	// If there is only a single host in the cluster (or, somehow, none) then
+	// that host has no buddy.
+	if len(nodePeersByIP) < 2 {
+		return netip.Addr{}, nil
+	}
+
+	nodeIPs := maps.Keys(nodePeersByIP)
+	slices.SortFunc(nodeIPs, netip.Addr.Compare)
+
+	for i, nodeIP := range nodeIPs {
+		var buddyIP netip.Addr
+		if i == len(nodeIPs)-1 {
+			buddyIP = nodeIPs[0]
+		} else if nodeIP == thisIP {
+			buddyIP = nodeIPs[i+1]
+		} else {
+			continue
+		}
+		return buddyIP, nodePeersByIP[buddyIP]
+	}
+
+	panic("Unreachable")
+}
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
 }

 func (n *network) periodically(
-	logger *mlog.Logger,
+	label string,
 	fn func(context.Context) error,
 	period time.Duration,
 ) {
@ -427,13 +427,13 @@ func (n *network) periodically(
 	go func() {
 		defer n.wg.Done()

-		ctx := mctx.Annotate(n.workerCtx, "period", period)
+		ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)

 		ticker := time.NewTicker(period)
 		defer ticker.Stop()

-		logger.Info(ctx, "Starting background job runner")
-		defer logger.Info(ctx, "Stopping background job runner")
+		n.logger.Info(ctx, "Starting background job runner")
+		defer n.logger.Info(ctx, "Stopping background job runner")

 		for {
 			select {
@ -441,9 +441,9 @@ func (n *network) periodically(
 				return

 			case <-ticker.C:
-				logger.Info(ctx, "Background job running")
+				n.logger.Info(ctx, "Background job running")
 				if err := fn(ctx); err != nil {
-					logger.Error(ctx, "Background job failed", err)
+					n.logger.Error(ctx, "Background job failed", err)
 				}
 			}
 		}
@ -510,10 +510,10 @@ func (n *network) initialize(
 		return fmt.Errorf("Reloading network bootstrap: %w", err)
 	}

+	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
+
 	n.periodically(
-		n.logger.WithNamespace("reloadHosts"),
-		n.reloadHosts,
-		3*time.Minute,
+		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
 	)

 	return nil
@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }

+// In general each host will manage the garage cluster layout of its own storage
+// allocations via garageApplyLayout. There are three situations which are
+// handled here, rather than garageApplyLayout:
+//
+//   - A host removes all of its allocations via SetConfig.
+//   - A host removes all of its allocations by calling Load with no allocations
+//     in the provided daecommon.NetworkConfig.
+//   - A host is removed from the network by another host.
+//
+// In all of these cases the host no longer has any garage instances running,
+// and so can't call garageApplyLayout on itself. To combat this we have all
+// hosts which do have garage instances running periodically check that there's
+// not some garage nodes orphaned in the cluster layout, and remove them if so.
+func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
+	n.l.RLock()
+	defer n.l.RUnlock()
+
+	thisHost := n.currBootstrap.ThisHost()
+	if len(thisHost.Garage.Instances) == 0 {
+		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
+		return nil
+	}
+
+	adminClient := newGarageAdminClient(
+		n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
+	)
+	defer adminClient.Close()
+
+	clusterStatus, err := adminClient.Status(ctx)
+	if err != nil {
+		return fmt.Errorf("retrieving garage cluster status: %w", err)
+	}
+
+	buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
+	if len(buddyNodes) == 0 {
+		return nil
+	}
+
+	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
+
+	for _, host := range n.currBootstrap.Hosts {
+		if host.IP() != buddyIP {
+			continue
+		} else if len(host.Garage.Instances) > 0 {
+			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
+			return nil
+		}
+		break
+	}
+
+	// Either the host is no longer in the network, or it no longer has any
+	// garage instances set on it. Either way, remove its nodes from the cluster
+	// layout.
+
+	buddyNodeIDs := make([]string, len(buddyNodes))
+	for i, buddyNode := range buddyNodes {
+		buddyNodeIDs[i] = buddyNode.ID
+	}
+
+	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
+	if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
+		return fmt.Errorf(
+			"applying garage cluster layout, removing nodes %+v: %w",
+			buddyNodes,
+			err,
+		)
+	}
+
+	return nil
+}
+
 // returns the bootstrap prior to the reload being applied.
 func (n *network) reload(
 	ctx context.Context,
--- a/go/daemon/network/network_it_test.go
+++ b/go/daemon/network/network_it_test.go
@ -54,19 +54,44 @@ func TestLoad(t *testing.T) {
 }

 func TestJoin(t *testing.T) {
-	var (
-		h        = newIntegrationHarness(t)
-		primus   = h.createNetwork(t, "primus", nil)
-		secondus = h.joinNetwork(t, primus, "secondus", nil)
-	)
+	t.Run("simple", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", nil)
+		)

-	primusHosts, err := primus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)

-	secondusHosts, err := secondus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)

-	assert.Equal(t, primusHosts, secondusHosts)
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
+
+	t.Run("with alloc", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+		)
+
+		t.Log("reloading primus' hosts")
+		assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
+
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
 }

 func TestNetwork_GetConfig(t *testing.T) {
@ -189,5 +214,33 @@ func TestNetwork_SetConfig(t *testing.T) {
 		assert.ElementsMatch(t, expPeers, layout.Peers)
 	})

-	// TODO a host having allocs but removing all of them
+	t.Run("remove all storage allocs", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+			networkConfig = secondus.getConfig(t)
+
+			prevHost = secondus.getHostsByName(t)[secondus.hostName]
+			//removedAlloc = networkConfig.Storage.Allocations[0]
+			removedPeer = allocsToPeerLayouts(
+				secondus.hostName, prevHost.Garage.Instances,
+			)[0]
+			//removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
+			//	prevHost, removedAlloc,
+			//)
+		)
+
+		networkConfig.Storage.Allocations = nil
+		assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
+
+		t.Log("Checking that garage layout still contains the old allocation")
+		layout, err := secondus.garageAdminClient(t).GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.Contains(t, layout.Peers, removedPeer)
+	})
 }
--- a/go/garage/admin_client.go
+++ b/go/garage/admin_client.go
@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httputil"
+	"net/netip"
 	"time"

 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@ -158,6 +159,47 @@ func (c *AdminClient) do(
 	return nil
 }

+// KnownNode describes the fields of a known node in the cluster, as returned
+// as part of [ClusterStatus].
+type KnownNode struct {
+	ID              string         `json:"id"`
+	Addr            netip.AddrPort `json:"addr"`
+	IsUp            bool           `json:"isUp"`
+	LastSeenSecsAgo int            `json:"lastSeenSecsAgo"`
+	HostName        string         `json:"hostname"`
+}
+
+// PeerLayout describes the properties of a garage peer in the context of the
+// layout of the cluster.
+//
+// TODO This should be called Role.
+type PeerLayout struct {
+	ID       string   `json:"id"`
+	Capacity int      `json:"capacity"` // Gb (SI units)
+	Zone     string   `json:"zone"`
+	Tags     []string `json:"tags"`
+}
+
+// ClusterLayout describes the layout of the cluster as a whole.
+type ClusterLayout struct {
+	Peers []PeerLayout `json:"roles"`
+}
+
+// ClusterStatus is returned from the Status endpoint, describing the currently
+// known state of the cluster.
+type ClusterStatus struct {
+	Nodes  []KnownNode   `json:"nodes"`
+	Layout ClusterLayout `json:"layout"`
+}
+
+// Status returns the current state of the cluster.
+func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
+	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
+	var clusterStatus ClusterStatus
+	err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
+	return clusterStatus, err
+}
+
 // Wait will block until the instance connected to can see at least
 // ReplicationFactor other garage instances. If the context is canceled it
 // will return the context error.
@ -166,30 +208,32 @@ func (c *AdminClient) Wait(ctx context.Context) error {
 	for first := true; ; first = false {

 		if !first {
-			time.Sleep(250 * time.Millisecond)
+			select {
+			case <-time.After(500 * time.Millisecond):
+			case <-ctx.Done():
+				return ctx.Err()
+			}
 		}

-		// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
-		var clusterStatus struct {
-			Nodes []struct {
-				IsUp bool `json:"isUp"`
-			} `json:"nodes"`
-		}
-
-		err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
-
+		c.logger.Debug(ctx, "Getting cluster status")
+		clusterStatus, err := c.Status(ctx)
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			return ctxErr

 		} else if err != nil {
-			c.logger.Warn(ctx, "waiting for instance to become ready", err)
+			c.logger.Warn(ctx, "Instance is not yet ready", err)
 			continue
 		}

 		var numUp int

 		for _, node := range clusterStatus.Nodes {
-			if node.IsUp {
+			// There seems to be some kind of step between IsUp becoming true
+			// and garage actually loading the full state of a node, so we check
+			// for the HostName as well. We could also use LastSeenSecsAgo, but
+			// that remains null for the node being queried so it's more
+			// annoying to use.
+			if node.IsUp && node.HostName != "" {
 				numUp++
 			}
 		}
@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
 			return nil
 		}

-		c.logger.Debug(ctx, "instance not online yet, will continue waiting")
+		c.logger.Debug(ctx, "Instance not online yet, will continue waiting")
 	}
 }

@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
 	})
 }

-// PeerLayout describes the properties of a garage peer in the context of the
-// layout of the cluster.
-type PeerLayout struct {
-	ID       string   `json:"id"`
-	Capacity int      `json:"capacity"` // Gb (SI units)
-	Zone     string   `json:"zone"`
-	Tags     []string `json:"tags"`
-}
-
-// ClusterLayout describes the layout of the cluster as a whole.
-type ClusterLayout struct {
-	Peers []PeerLayout `json:"roles"`
-}
-
 // GetLayout returns the currently applied ClusterLayout.
 func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout