Have hosts update garage cluster layout to remove other nodes if necessary

2024-11-05 22:31:57 +01:00 · 2024-11-05 22:31:57 +01:00 · 1f5286dbb4
commit 1f5286dbb4
parent 53a06af9ba
6 changed files with 306 additions and 62 deletions
--- a/docs/operator/contributing-storage.md
+++ b/docs/operator/contributing-storage.md
@ -1,8 +1,8 @@
 # Contributing Storage

-If your host machine can be reasonably sure of being online most, if not all, of
-the time, and has 1GB or more of unused drive space you'd like to contribute to
-the network, then this document is for you.
+This document is for you if your host machine can be reliably be online at all
+times and has 1GB or more of unused drive space you'd like to contribute to the
+network.

 ## Edit `daemon.yml`

@ -16,7 +16,7 @@ one allocation listed.
 The comments in the file should be self-explanatory, but ask your admin if you
 need any clarification.

-Here are an example set of allocations for a host which is contributing space
+Here is an example set of allocations for a host which is contributing space
 from two separate drives:

 ```
@ -47,6 +47,17 @@ process.
 The `isle daemon` will automatically allow the ports used for your
 storage allocations in the vpn firewall.

+## Removing Allocations
+
+If you later decide to no longer provide storage simply remove the
+`storage.allocations` item from your `/etc/isle/daemon.yml` file and restart the
+`isle daemon` process.
+
+Once removed, it is advisable to wait some time before turning off the daemon or
+deleting the data from the data and meta directories which were previously
+allocated to. This gives other hosts in the network time to offload all data
+that yours was previously hosting.
+
 ## Further Reading

 Isle uses the [garage][garage] project for its storage system. See the
--- a/go/daemon/children/children.go
+++ b/go/daemon/children/children.go
@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
 	return nil
 }

-// TODO this doesn't handle removing garage nodes
 func (c *Children) reloadGarage(
 	ctx context.Context,
 	networkConfig daecommon.NetworkConfig,
@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
 			)
 		)

+		// TODO it's possible that the config changed, but only the bootstrap
+		// peers, in which case we don't need to restart the node.
 		childConfigPath, changed, err := garageWriteChildConfig(
 			ctx,
 			c.logger,
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@ -12,13 +12,16 @@ import (
 	"isle/nebula"
 	"isle/secrets"
 	"net"
+	"net/netip"
 	"path/filepath"
+	"slices"
 	"strconv"
 	"time"

 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
+	"golang.org/x/exp/maps"
 )

 // Paths within garage's global bucket.
@ -341,3 +344,59 @@ func garageWaitForAlloc(
 		return nil
 	}
 }
+
+// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
+// the given garage cluster status. It will return zero values if the host has
+// no buddy.
+//
+// For situations where we want one host to affect the cluster layout of another
+// host's peers, we use a simple system to determine a single host which is
+// responsible. The goal is not to be 100% race-proof (garage handles that), but
+// rather to try to prevent all hosts from modifying the same host's layout at
+// the same time.
+//
+// The system is to order all hosts by their IP, and say that each host is
+// responsible for (aka the "buddy" of) the host immediately after their own in
+// that list. The last host in that list is responsible for the first.
+func garageNodeBuddyPeers(
+	status garage.ClusterStatus, host bootstrap.Host,
+) (
+	netip.Addr, []garage.Role,
+) {
+	var (
+		thisIP        = host.IP()
+		nodeRolesByIP = map[netip.Addr][]garage.Role{}
+	)
+
+	for _, node := range status.Nodes {
+		if node.Role == nil {
+			continue
+		}
+
+		ip := node.Addr.Addr()
+		nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
+	}
+
+	// If there is only a single host in the cluster (or, somehow, none) then
+	// that host has no buddy.
+	if len(nodeRolesByIP) < 2 {
+		return netip.Addr{}, nil
+	}
+
+	nodeIPs := maps.Keys(nodeRolesByIP)
+	slices.SortFunc(nodeIPs, netip.Addr.Compare)
+
+	for i, nodeIP := range nodeIPs {
+		var buddyIP netip.Addr
+		if i == len(nodeIPs)-1 {
+			buddyIP = nodeIPs[0]
+		} else if nodeIP == thisIP {
+			buddyIP = nodeIPs[i+1]
+		} else {
+			continue
+		}
+		return buddyIP, nodeRolesByIP[buddyIP]
+	}
+
+	panic("Unreachable")
+}
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
 }

 func (n *network) periodically(
-	logger *mlog.Logger,
+	label string,
 	fn func(context.Context) error,
 	period time.Duration,
 ) {
@ -427,13 +427,13 @@ func (n *network) periodically(
 	go func() {
 		defer n.wg.Done()

-		ctx := mctx.Annotate(n.workerCtx, "period", period)
+		ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)

 		ticker := time.NewTicker(period)
 		defer ticker.Stop()

-		logger.Info(ctx, "Starting background job runner")
-		defer logger.Info(ctx, "Stopping background job runner")
+		n.logger.Info(ctx, "Starting background job runner")
+		defer n.logger.Info(ctx, "Stopping background job runner")

 		for {
 			select {
@ -441,9 +441,9 @@ func (n *network) periodically(
 				return

 			case <-ticker.C:
-				logger.Info(ctx, "Background job running")
+				n.logger.Info(ctx, "Background job running")
 				if err := fn(ctx); err != nil {
-					logger.Error(ctx, "Background job failed", err)
+					n.logger.Error(ctx, "Background job failed", err)
 				}
 			}
 		}
@ -510,10 +510,10 @@ func (n *network) initialize(
 		return fmt.Errorf("Reloading network bootstrap: %w", err)
 	}

+	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
+
 	n.periodically(
-		n.logger.WithNamespace("reloadHosts"),
-		n.reloadHosts,
-		3*time.Minute,
+		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
 	)

 	return nil
@ -531,7 +531,7 @@ func (n *network) postChildrenInit(

 	thisHost := n.currBootstrap.ThisHost()

-	if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 {
+	if len(thisHost.Garage.Instances) > 0 {
 		n.logger.Info(ctx, "Applying garage layout")
 		if err := garageApplyLayout(
 			ctx,
@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }

+// In general each host will manage the garage cluster layout of its own storage
+// allocations via garageApplyLayout. There are three situations which are
+// handled here, rather than garageApplyLayout:
+//
+//   - A host removes all of its allocations via SetConfig.
+//   - A host removes all of its allocations by calling Load with no allocations
+//     in the provided daecommon.NetworkConfig.
+//   - A host is removed from the network by another host.
+//
+// In all of these cases the host no longer has any garage instances running,
+// and so can't call garageApplyLayout on itself. To combat this we have all
+// hosts which do have garage instances running periodically check that there's
+// not some garage nodes orphaned in the cluster layout, and remove them if so.
+func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
+	n.l.RLock()
+	defer n.l.RUnlock()
+
+	thisHost := n.currBootstrap.ThisHost()
+	if len(thisHost.Garage.Instances) == 0 {
+		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
+		return nil
+	}
+
+	adminClient := newGarageAdminClient(
+		n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
+	)
+	defer adminClient.Close()
+
+	clusterStatus, err := adminClient.Status(ctx)
+	if err != nil {
+		return fmt.Errorf("retrieving garage cluster status: %w", err)
+	}
+
+	buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
+	if len(buddyNodes) == 0 {
+		return nil
+	}
+
+	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
+
+	for _, host := range n.currBootstrap.Hosts {
+		if host.IP() != buddyIP {
+			continue
+		} else if len(host.Garage.Instances) > 0 {
+			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
+			return nil
+		}
+		break
+	}
+
+	// Either the host is no longer in the network, or it no longer has any
+	// garage instances set on it. Either way, remove its nodes from the cluster
+	// layout.
+
+	buddyNodeIDs := make([]string, len(buddyNodes))
+	for i, buddyNode := range buddyNodes {
+		buddyNodeIDs[i] = buddyNode.ID
+	}
+
+	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
+	if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
+		return fmt.Errorf(
+			"applying garage cluster layout, removing nodes %+v: %w",
+			buddyNodes,
+			err,
+		)
+	}
+
+	return nil
+}
+
 // returns the bootstrap prior to the reload being applied.
 func (n *network) reload(
 	ctx context.Context,
--- a/go/daemon/network/network_it_test.go
+++ b/go/daemon/network/network_it_test.go
@ -54,19 +54,44 @@ func TestLoad(t *testing.T) {
 }

 func TestJoin(t *testing.T) {
-	var (
-		h        = newIntegrationHarness(t)
-		primus   = h.createNetwork(t, "primus", nil)
-		secondus = h.joinNetwork(t, primus, "secondus", nil)
-	)
+	t.Run("simple", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", nil)
+		)

-	primusHosts, err := primus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)

-	secondusHosts, err := secondus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)

-	assert.Equal(t, primusHosts, secondusHosts)
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
+
+	t.Run("with alloc", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+		)
+
+		t.Log("reloading primus' hosts")
+		assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
+
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
 }

 func TestNetwork_GetConfig(t *testing.T) {
@ -189,5 +214,59 @@ func TestNetwork_SetConfig(t *testing.T) {
 		assert.ElementsMatch(t, expRoles, layout.Roles)
 	})

-	// TODO a host having allocs but removing all of them
+	t.Run("remove all storage allocs", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+			networkConfig = secondus.getConfig(t)
+
+			prevHost     = secondus.getHostsByName(t)[secondus.hostName]
+			removedAlloc = networkConfig.Storage.Allocations[0]
+			removedRole  = allocsToRoles(
+				secondus.hostName, prevHost.Garage.Instances,
+			)[0]
+			removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
+				prevHost, removedAlloc,
+			)
+
+			primusGarageAdminClient = primus.garageAdminClient(t)
+		)
+
+		networkConfig.Storage.Allocations = nil
+		assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
+
+		t.Log("Checking that the Host information was updated")
+		newHostsByName := primus.getHostsByName(t)
+		newHost, ok := newHostsByName[secondus.hostName]
+		assert.True(t, ok)
+
+		allocs := newHost.HostConfigured.Garage.Instances
+		assert.Len(t, allocs, 3)
+		assert.NotContains(t, allocs, removedGarageInst)
+
+		t.Log("Checking that garage layout still contains the old allocation")
+		layout, err := primusGarageAdminClient.GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.Contains(t, layout.Roles, removedRole)
+
+		t.Log("Removing orphan garage nodes with primus")
+		assert.NoError(
+			t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
+		)
+
+		t.Log("Checking that garage layout no longer contains the old allocation")
+		layout, err = primusGarageAdminClient.GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.NotContains(t, layout.Roles, removedRole)
+	})
+
+	// TODO make sure that if two nodes each have 3, and one removes all 3, that
+	// the other is able to retain availability of all data. This means that the
+	// garage nodes on the node which removed all allocs need to stay online
+	// some time, in order to replicate data to the leftover nodes.
 }
--- a/go/garage/admin_client.go
+++ b/go/garage/admin_client.go
@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httputil"
+	"net/netip"
 	"time"

 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@ -158,38 +159,81 @@ func (c *AdminClient) do(
 	return nil
 }

+// KnownNode describes the fields of a known node in the cluster, as returned
+// as part of [ClusterStatus].
+type KnownNode struct {
+	ID              string         `json:"id"`
+	Role            *Role          `json:"role"`
+	Addr            netip.AddrPort `json:"addr"`
+	IsUp            bool           `json:"isUp"`
+	LastSeenSecsAgo int            `json:"lastSeenSecsAgo"`
+	HostName        string         `json:"hostname"`
+}
+
+// Role descibes a node's role in the garage cluster, i.e. what storage it is
+// providing.
+type Role struct {
+	ID       string   `json:"id"`
+	Capacity int      `json:"capacity"` // Gb (SI units)
+	Zone     string   `json:"zone"`
+	Tags     []string `json:"tags"`
+}
+
+// ClusterLayout describes the layout of the cluster as a whole.
+type ClusterLayout struct {
+	Version           int    `json:"version"`
+	Roles             []Role `json:"roles"`
+	StagedRoleChanges []Role `json:"stagedRoleChanges"`
+}
+
+// ClusterStatus is returned from the Status endpoint, describing the currently
+// known state of the cluster.
+type ClusterStatus struct {
+	Nodes []KnownNode `json:"nodes"`
+}
+
+// Status returns the current state of the cluster.
+func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
+	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
+	var clusterStatus ClusterStatus
+	err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
+	return clusterStatus, err
+}
+
 // Wait will block until the instance connected to can see at least
 // ReplicationFactor other garage instances. If the context is canceled it
 // will return the context error.
 func (c *AdminClient) Wait(ctx context.Context) error {

 	for first := true; ; first = false {
-
 		if !first {
-			time.Sleep(250 * time.Millisecond)
+			select {
+			case <-time.After(2 * time.Second):
+			case <-ctx.Done():
+				return ctx.Err()
+			}
 		}

-		// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
-		var clusterStatus struct {
-			Nodes []struct {
-				IsUp bool `json:"isUp"`
-			} `json:"nodes"`
-		}
-
-		err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
-
+		c.logger.Debug(ctx, "Getting cluster status")
+		clusterStatus, err := c.Status(ctx)
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			return ctxErr

 		} else if err != nil {
-			c.logger.Warn(ctx, "waiting for instance to become ready", err)
+			ctx := mctx.Annotate(ctx, "errMsg", err.Error())
+			c.logger.Info(ctx, "Instance is not online yet")
 			continue
 		}

 		var numUp int

 		for _, node := range clusterStatus.Nodes {
-			if node.IsUp {
+			// There seems to be some kind of step between IsUp becoming true
+			// and garage actually loading the full state of a node, so we check
+			// for the HostName as well. We could also use LastSeenSecsAgo, but
+			// that remains null for the node being queried so it's more
+			// annoying to use.
+			if node.IsUp && node.HostName != "" {
 				numUp++
 			}
 		}
@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
 			return nil
 		}

-		c.logger.Debug(ctx, "instance not online yet, will continue waiting")
+		c.logger.Info(ctx, "Instance is not joined to the cluster yet")
 	}
 }

@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
 	})
 }

-// Role descibes a node's role in the garage cluster, i.e. what storage it is
-// providing.
-type Role struct {
-	ID       string   `json:"id"`
-	Capacity int      `json:"capacity"` // Gb (SI units)
-	Zone     string   `json:"zone"`
-	Tags     []string `json:"tags"`
-}
-
-// ClusterLayout describes the layout of the cluster as a whole.
-type ClusterLayout struct {
-	Roles []Role `json:"roles"`
-}
-
 // GetLayout returns the currently applied ClusterLayout.
 func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
@ -323,13 +353,7 @@ func (c *AdminClient) ApplyLayout(
 		roles = append(roles, removeRole{ID: id, Remove: true})
 	}

-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
-	var clusterLayout struct {
-		Version           int    `json:"version"`
-		StagedRoleChanges []Role `json:"stagedRoleChanges"`
-	}
-
-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
+	var clusterLayout ClusterLayout
 	err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles)
 	if err != nil {
 		return fmt.Errorf("staging layout changes: %w", err)
@ -337,7 +361,6 @@ func (c *AdminClient) ApplyLayout(
 		return nil
 	}

-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
 	applyClusterLayout := struct {
 		Version int `json:"version"`
 	}{