Have hosts update garage cluster layout to remove other nodes if necessary

2024-11-05 22:31:57 +01:00 · 2024-11-05 22:31:57 +01:00 · 1f5286dbb4
commit 1f5286dbb4
parent 53a06af9ba
6 changed files with 306 additions and 62 deletions
--- a/docs/operator/contributing-storage.md
+++ b/docs/operator/contributing-storage.md
@ -1,8 +1,8 @@
 # Contributing Storage
-If your host machine can be reasonably sure of being online most, if not all, of
+This document is for you if your host machine can reliably be online at all
-the time, and has 1GB or more of unused drive space you'd like to contribute to
+times and has 1GB or more of unused drive space you'd like to contribute to the
-the network, then this document is for you.
+network.
 ## Edit `daemon.yml`
@ -16,7 +16,7 @@ one allocation listed.
 The comments in the file should be self-explanatory, but ask your admin if you
 need any clarification.
-Here are an example set of allocations for a host which is contributing space
+Here is an example set of allocations for a host which is contributing space
 from two separate drives:
 ```
@ -47,6 +47,17 @@ process.
 The `isle daemon` will automatically allow the ports used for your
 storage allocations in the vpn firewall.
 ## Removing Allocations
 If you later decide to no longer provide storage simply remove the
 `storage.allocations` item from your `/etc/isle/daemon.yml` file and restart the
 `isle daemon` process.
 Once removed, it is advisable to wait some time before turning off the daemon or
 deleting the contents of the data and meta directories which were previously
 allocated. This gives other hosts in the network time to offload all data
 that yours was previously hosting.
 ## Further Reading
 Isle uses the [garage][garage] project for its storage system. See the
--- a/go/daemon/children/children.go
+++ b/go/daemon/children/children.go
@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
 	return nil
 }
 // TODO this doesn't handle removing garage nodes
 func (c *Children) reloadGarage(
 	ctx context.Context,
 	networkConfig daecommon.NetworkConfig,
@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
 			)
 		)
 		// TODO it's possible that the config changed, but only the bootstrap
 		// peers, in which case we don't need to restart the node.
 		childConfigPath, changed, err := garageWriteChildConfig(
 			ctx,
 			c.logger,
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@ -12,13 +12,16 @@ import (
 	"isle/nebula"
 	"isle/secrets"
 	"net"
 	"net/netip"
 	"path/filepath"
 	"slices"
 	"strconv"
 	"time"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
 	"golang.org/x/exp/maps"
 )
 // Paths within garage's global bucket.
@ -341,3 +344,59 @@ func garageWaitForAlloc(
 		return nil
 	}
 }
 // garageNodeBuddyPeers picks the "buddy" of the given host out of the given
 // garage cluster status, returning the buddy's IP and the roles of all nodes
 // in the status which share that IP. Zero values are returned when fewer than
 // two distinct IPs have nodes with a role, i.e. when there is nobody to be a
 // buddy of.
 //
 // The buddy system exists for cases where one host needs to modify the cluster
 // layout on behalf of another host. It is not meant to be 100% race-proof
 // (garage handles that); it only tries to avoid every host modifying the same
 // host's layout at the same time.
 //
 // All IPs which have at least one node with a role are sorted, and each host
 // is the buddy of the IP immediately following its own. The host holding the
 // last IP wraps around to the first.
 //
 // NOTE(review): if the given host's IP does not appear in the sorted list at
 // all then the first IP is returned as well, same as for the last host.
 // Confirm whether that fallback is intended.
 func garageNodeBuddyPeers(
 	status garage.ClusterStatus, host bootstrap.Host,
 ) (
 	netip.Addr, []garage.Role,
 ) {
 	ownIP := host.IP()
 	// Group the roles of all nodes by IP; nodes without a role are ignored.
 	rolesByIP := map[netip.Addr][]garage.Role{}
 	for _, node := range status.Nodes {
 		if node.Role != nil {
 			ip := node.Addr.Addr()
 			rolesByIP[ip] = append(rolesByIP[ip], *node.Role)
 		}
 	}
 	// With zero or one IP in the cluster there is no buddy to be had.
 	if len(rolesByIP) < 2 {
 		return netip.Addr{}, nil
 	}
 	sortedIPs := maps.Keys(rolesByIP)
 	slices.SortFunc(sortedIPs, netip.Addr.Compare)
 	// Start from the wrap-around choice, and only move off of it if our own IP
 	// is found somewhere before the final position. The IPs are unique (they
 	// were map keys), so there is at most one match.
 	buddyIP := sortedIPs[0]
 	if i := slices.Index(sortedIPs, ownIP); i >= 0 && i < len(sortedIPs)-1 {
 		buddyIP = sortedIPs[i+1]
 	}
 	return buddyIP, rolesByIP[buddyIP]
 }
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
 }
 func (n *network) periodically(
-	logger *mlog.Logger,
+	label string,
 	fn func(context.Context) error,
 	period time.Duration,
 ) {
@ -427,13 +427,13 @@ func (n *network) periodically(
 	go func() {
 		defer n.wg.Done()
-		ctx := mctx.Annotate(n.workerCtx, "period", period)
+		ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)
 		ticker := time.NewTicker(period)
 		defer ticker.Stop()
-		logger.Info(ctx, "Starting background job runner")
+		n.logger.Info(ctx, "Starting background job runner")
-		defer logger.Info(ctx, "Stopping background job runner")
+		defer n.logger.Info(ctx, "Stopping background job runner")
 		for {
 			select {
@ -441,9 +441,9 @@ func (n *network) periodically(
 				return
 			case <-ticker.C:
-				logger.Info(ctx, "Background job running")
+				n.logger.Info(ctx, "Background job running")
 				if err := fn(ctx); err != nil {
-					logger.Error(ctx, "Background job failed", err)
+					n.logger.Error(ctx, "Background job failed", err)
 				}
 			}
 		}
@ -510,10 +510,10 @@ func (n *network) initialize(
 		return fmt.Errorf("Reloading network bootstrap: %w", err)
 	}
 	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
 	n.periodically(
-		n.logger.WithNamespace("reloadHosts"),
+		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
 		n.reloadHosts,
 		3*time.Minute,
 	)
 	return nil
@ -531,7 +531,7 @@ func (n *network) postChildrenInit(
 	thisHost := n.currBootstrap.ThisHost()
-	if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 {
+	if len(thisHost.Garage.Instances) > 0 {
 		n.logger.Info(ctx, "Applying garage layout")
 		if err := garageApplyLayout(
 			ctx,
@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }
 // removeOrphanGarageNodes removes the garage nodes of this host's buddy (see
 // garageNodeBuddyPeers) from the cluster layout, if the buddy is either absent
 // from the current bootstrap or has no garage instances listed in it. It does
 // nothing if this host has no garage instances of its own.
 //
 // Normally each host manages the cluster layout of its own storage allocations
 // via garageApplyLayout. That is not possible in the following situations:
 //
 //   - A host removes all of its allocations via SetConfig.
 //   - A host removes all of its allocations by calling Load with no allocations
 //     in the provided daecommon.NetworkConfig.
 //   - A host is removed from the network by another host.
 //
 // In each of these the host is left without any running garage instances, so
 // it cannot call garageApplyLayout for itself. Instead, every host which does
 // have garage instances periodically checks for garage nodes which have been
 // orphaned in the cluster layout, and removes any it finds.
 func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
 	n.l.RLock()
 	defer n.l.RUnlock()
 	self := n.currBootstrap.ThisHost()
 	if len(self.Garage.Instances) == 0 {
 		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
 		return nil
 	}
 	client := newGarageAdminClient(
 		n.logger, n.networkConfig, n.opts.GarageAdminToken, self,
 	)
 	defer client.Close()
 	status, err := client.Status(ctx)
 	if err != nil {
 		return fmt.Errorf("retrieving garage cluster status: %w", err)
 	}
 	buddyIP, buddyRoles := garageNodeBuddyPeers(status, self)
 	if len(buddyRoles) == 0 {
 		return nil
 	}
 	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
 	// Look the buddy up in the bootstrap. If it is there and still declares
 	// garage instances then its nodes are not orphans.
 	for _, candidate := range n.currBootstrap.Hosts {
 		if candidate.IP() != buddyIP {
 			continue
 		}
 		if len(candidate.Garage.Instances) > 0 {
 			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
 			return nil
 		}
 		break
 	}
 	// At this point the buddy is either gone from the network or no longer has
 	// garage instances set on it, so its nodes are dropped from the layout.
 	orphanIDs := make([]string, 0, len(buddyRoles))
 	for _, role := range buddyRoles {
 		orphanIDs = append(orphanIDs, role.ID)
 	}
 	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
 	err = client.ApplyLayout(ctx, nil, orphanIDs)
 	if err != nil {
 		return fmt.Errorf(
 			"applying garage cluster layout, removing nodes %+v: %w",
 			buddyRoles,
 			err,
 		)
 	}
 	return nil
 }
 // returns the bootstrap prior to the reload being applied.
 func (n *network) reload(
 	ctx context.Context,
--- a/go/daemon/network/network_it_test.go
+++ b/go/daemon/network/network_it_test.go
@ -54,6 +54,7 @@ func TestLoad(t *testing.T) {
 }
 func TestJoin(t *testing.T) {
 	t.Run("simple", func(t *testing.T) {
 		var (
 			h        = newIntegrationHarness(t)
 			primus   = h.createNetwork(t, "primus", nil)
@ -67,6 +68,30 @@ func TestJoin(t *testing.T) {
 		assert.NoError(t, err)
 		assert.Equal(t, primusHosts, secondusHosts)
 	})
 	t.Run("with alloc", func(t *testing.T) {
 		var (
 			h        = newIntegrationHarness(t)
 			primus   = h.createNetwork(t, "primus", nil)
 			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
 				networkConfigOpts: &networkConfigOpts{
 					numStorageAllocs: 1,
 				},
 			})
 		)
 		t.Log("reloading primus' hosts")
 		assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
 		primusHosts, err := primus.GetHosts(h.ctx)
 		assert.NoError(t, err)
 		secondusHosts, err := secondus.GetHosts(h.ctx)
 		assert.NoError(t, err)
 		assert.Equal(t, primusHosts, secondusHosts)
 	})
 }
 func TestNetwork_GetConfig(t *testing.T) {
@ -189,5 +214,59 @@ func TestNetwork_SetConfig(t *testing.T) {
 		assert.ElementsMatch(t, expRoles, layout.Roles)
 	})
-	// TODO a host having allocs but removing all of them
+	t.Run("remove all storage allocs", func(t *testing.T) {
 		var (
 			h        = newIntegrationHarness(t)
 			primus   = h.createNetwork(t, "primus", nil)
 			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
 				networkConfigOpts: &networkConfigOpts{
 					numStorageAllocs: 1,
 				},
 			})
 			networkConfig = secondus.getConfig(t)
 			prevHost     = secondus.getHostsByName(t)[secondus.hostName]
 			removedAlloc = networkConfig.Storage.Allocations[0]
 			removedRole  = allocsToRoles(
 				secondus.hostName, prevHost.Garage.Instances,
 			)[0]
 			removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
 				prevHost, removedAlloc,
 			)
 			primusGarageAdminClient = primus.garageAdminClient(t)
 		)
 		networkConfig.Storage.Allocations = nil
 		assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
 		t.Log("Checking that the Host information was updated")
 		newHostsByName := primus.getHostsByName(t)
 		newHost, ok := newHostsByName[secondus.hostName]
 		assert.True(t, ok)
 		allocs := newHost.HostConfigured.Garage.Instances
 		assert.Len(t, allocs, 3)
 		assert.NotContains(t, allocs, removedGarageInst)
 		t.Log("Checking that garage layout still contains the old allocation")
 		layout, err := primusGarageAdminClient.GetLayout(h.ctx)
 		assert.NoError(t, err)
 		assert.Contains(t, layout.Roles, removedRole)
 		t.Log("Removing orphan garage nodes with primus")
 		assert.NoError(
 			t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
 		)
 		t.Log("Checking that garage layout no longer contains the old allocation")
 		layout, err = primusGarageAdminClient.GetLayout(h.ctx)
 		assert.NoError(t, err)
 		assert.NotContains(t, layout.Roles, removedRole)
 	})
 	// TODO make sure that if two nodes each have 3, and one removes all 3, that
 	// the other is able to retain availability of all data. This means that the
 	// garage nodes on the node which removed all allocs need to stay online
 	// some time, in order to replicate data to the leftover nodes.
 }
--- a/go/garage/admin_client.go
+++ b/go/garage/admin_client.go
@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httputil"
 	"net/netip"
 	"time"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@ -158,38 +159,81 @@ func (c *AdminClient) do(
 	return nil
 }
 // KnownNode describes the fields of a known node in the cluster, as returned
 // as part of [ClusterStatus]. Each field is decoded from the JSON key named in
 // its struct tag.
 type KnownNode struct {
 	// ID is the identifier reported for the node.
 	ID              string         `json:"id"`
 	// Role is nil when the response carries no role for the node.
 	Role            *Role          `json:"role"`
 	// Addr is the address and port reported for the node.
 	Addr            netip.AddrPort `json:"addr"`
 	// IsUp alone is not treated as proof of readiness by Wait, which also
 	// requires HostName to be non-empty.
 	IsUp            bool           `json:"isUp"`
 	// NOTE(review): per the comment in Wait, garage reports null here for the
 	// node being queried; presumably that is left as 0 by decoding - confirm.
 	LastSeenSecsAgo int            `json:"lastSeenSecsAgo"`
 	// HostName is checked by Wait as a sign that the node's state has been
 	// fully loaded.
 	HostName        string         `json:"hostname"`
 }
 // Role describes a node's role in the garage cluster, i.e. what storage it is
 // providing.
 type Role struct {
 	// ID identifies the node which the role belongs to; it is the value passed
 	// to ApplyLayout when removing a node's role.
 	ID       string   `json:"id"`
 	Capacity int      `json:"capacity"` // Gb (SI units)
 	Zone     string   `json:"zone"`
 	Tags     []string `json:"tags"`
 }
 // ClusterLayout describes the layout of the cluster as a whole. It is the
 // type returned by GetLayout, and ApplyLayout decodes the response of staging
 // role changes into it.
 type ClusterLayout struct {
 	// Version is decoded from the "version" key of the response.
 	Version           int    `json:"version"`
 	// Roles is decoded from the "roles" key of the response.
 	Roles             []Role `json:"roles"`
 	// StagedRoleChanges is decoded from the "stagedRoleChanges" key.
 	// NOTE(review): presumably these are changes staged but not yet applied -
 	// confirm against the garage admin API docs.
 	StagedRoleChanges []Role `json:"stagedRoleChanges"`
 }
 // ClusterStatus is returned from the Status endpoint, describing the currently
 // known state of the cluster.
 type ClusterStatus struct {
 	// Nodes is decoded from the "nodes" key of the response, one entry per
 	// known node.
 	Nodes []KnownNode `json:"nodes"`
 }
 // Status performs a GET against the admin API's /v1/status endpoint and
 // returns the response decoded as a ClusterStatus, along with any error from
 // performing the request.
 func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
 	var status ClusterStatus
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
 	if err := c.do(ctx, &status, "GET", "/v1/status", nil); err != nil {
 		return status, err
 	}
 	return status, nil
 }
 // Wait will block until the instance connected to can see at least
 // ReplicationFactor other garage instances. If the context is canceled it
 // will return the context error.
 func (c *AdminClient) Wait(ctx context.Context) error {
 	for first := true; ; first = false {
 		if !first {
-			time.Sleep(250 * time.Millisecond)
+			select {
 			case <-time.After(2 * time.Second):
 			case <-ctx.Done():
 				return ctx.Err()
 			}
 		}
-		// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
+		c.logger.Debug(ctx, "Getting cluster status")
-		var clusterStatus struct {
+		clusterStatus, err := c.Status(ctx)
 			Nodes []struct {
 				IsUp bool `json:"isUp"`
 			} `json:"nodes"`
 		}
 		err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			return ctxErr
 		} else if err != nil {
-			c.logger.Warn(ctx, "waiting for instance to become ready", err)
+			ctx := mctx.Annotate(ctx, "errMsg", err.Error())
 			c.logger.Info(ctx, "Instance is not online yet")
 			continue
 		}
 		var numUp int
 		for _, node := range clusterStatus.Nodes {
-			if node.IsUp {
+			// There seems to be some kind of step between IsUp becoming true
 			// and garage actually loading the full state of a node, so we check
 			// for the HostName as well. We could also use LastSeenSecsAgo, but
 			// that remains null for the node being queried so it's more
 			// annoying to use.
 			if node.IsUp && node.HostName != "" {
 				numUp++
 			}
 		}
@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
 			return nil
 		}
-		c.logger.Debug(ctx, "instance not online yet, will continue waiting")
+		c.logger.Info(ctx, "Instance is not joined to the cluster yet")
 	}
 }
@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
 	})
 }
 // Role describes a node's role in the garage cluster, i.e. what storage it is
 // providing.
 type Role struct {
 	// ID identifies the node which the role belongs to.
 	ID       string   `json:"id"`
 	Capacity int      `json:"capacity"` // Gb (SI units)
 	Zone     string   `json:"zone"`
 	Tags     []string `json:"tags"`
 }
 // ClusterLayout describes the layout of the cluster as a whole.
 type ClusterLayout struct {
 	// Roles is decoded from the "roles" key of the response.
 	Roles []Role `json:"roles"`
 }
 // GetLayout returns the currently applied ClusterLayout.
 func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
@ -323,13 +353,7 @@ func (c *AdminClient) ApplyLayout(
 		roles = append(roles, removeRole{ID: id, Remove: true})
 	}
-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
+	var clusterLayout ClusterLayout
 	var clusterLayout struct {
 		Version           int    `json:"version"`
 		StagedRoleChanges []Role `json:"stagedRoleChanges"`
 	}
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
 	err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles)
 	if err != nil {
 		return fmt.Errorf("staging layout changes: %w", err)
@ -337,7 +361,6 @@ func (c *AdminClient) ApplyLayout(
 		return nil
 	}
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
 	applyClusterLayout := struct {
 		Version int `json:"version"`
 	}{