From 032bdb9e438bc2e5b4a19b105321ca6db34979f5 Mon Sep 17 00:00:00 2001
From: Brian Picciano
Date: Tue, 5 Nov 2024 22:31:57 +0100
Subject: [PATCH] Have hosts update garage cluster layout to remove other
 nodes if necessary

---
 docs/operator/contributing-storage.md | 18 +++-
 go/daemon/children/children.go        |  3 +-
 go/daemon/network/garage.go           | 59 ++++++++++++++++
 go/daemon/network/network.go          | 91 ++++++++++++++++++++++---
 go/daemon/network/network_it_test.go  | 96 ++++++++++++++++++++++++---
 go/garage/admin_client.go             | 95 ++++++++++++++++----------
 6 files changed, 300 insertions(+), 62 deletions(-)

diff --git a/docs/operator/contributing-storage.md b/docs/operator/contributing-storage.md
index 754e16d..b7ff290 100644
--- a/docs/operator/contributing-storage.md
+++ b/docs/operator/contributing-storage.md
@@ -1,8 +1,8 @@
 # Contributing Storage
 
-If your host machine can be reasonably sure of being online most, if not all, of
-the time, and has 1GB or more of unused drive space you'd like to contribute to
-the network, then this document is for you.
+This document is for you if your host machine can reliably be online at all
+times and has 1GB or more of unused drive space you'd like to contribute to the
+network.
 
 ## Edit `daemon.yml`
 
@@ -16,7 +16,7 @@ one allocation listed.
 The comments in the file should be self-explanatory, but ask your admin if you
 need any clarification.
 
-Here are an example set of allocations for a host which is contributing space
+Here is an example set of allocations for a host which is contributing space
 from two separate drives:
 
 ```
@@ -47,6 +47,16 @@ process.
 The `isle daemon` will automatically allow the ports used for your storage
 allocations in the vpn firewall.
 
+## Removing Allocations
+
+If you later decide to no longer provide storage, simply remove the
+`storage.allocations` item from your `/etc/isle/daemon.yml` file and restart the
+`isle daemon` process.
+
+Once removed, it is advisable to wait some time before removing storage
+allocations from other hosts. This ensures that all data which was previously
+on this host has had a chance to fully replicate to multiple other hosts.
+
 ## Further Reading
 
 Isle uses the [garage][garage] project for its storage system. See the
diff --git a/go/daemon/children/children.go b/go/daemon/children/children.go
index 5642ca7..6e00c2d 100644
--- a/go/daemon/children/children.go
+++ b/go/daemon/children/children.go
@@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
 	return nil
 }
 
-// TODO this doesn't handle removing garage nodes
 func (c *Children) reloadGarage(
 	ctx context.Context,
 	networkConfig daecommon.NetworkConfig,
@@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
 		)
 	)
 
+	// TODO it's possible that the config changed, but only the bootstrap
+	// peers, in which case we don't need to restart the node.
 	childConfigPath, changed, err := garageWriteChildConfig(
 		ctx,
 		c.logger,
diff --git a/go/daemon/network/garage.go b/go/daemon/network/garage.go
index faa18ab..28bff19 100644
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@@ -12,13 +12,16 @@ import (
 	"isle/nebula"
 	"isle/secrets"
 	"net"
+	"net/netip"
 	"path/filepath"
+	"slices"
 	"strconv"
 	"time"
 
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
+	"golang.org/x/exp/maps"
 )
 
 // Paths within garage's global bucket.
@@ -341,3 +344,59 @@ func garageWaitForAlloc(
 		return nil
 	}
 }
+
+// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
+// the given garage cluster status. It will return zero values if the host has
+// no buddy.
+//
+// For situations where we want one host to affect the cluster layout of another
+// host's peers, we use a simple system to determine a single host which is
+// responsible. The goal is not to be 100% race-proof (garage handles that), but
+// rather to try to prevent all hosts from modifying the same host's layout at
+// the same time.
+//
+// The system is to order all hosts by their IP, and say that each host is
+// responsible for (aka the "buddy" of) the host immediately after their own in
+// that list. The last host in that list is responsible for the first.
+func garageNodeBuddyPeers(
+	status garage.ClusterStatus, host bootstrap.Host,
+) (
+	netip.Addr, []garage.Role,
+) {
+	var (
+		thisIP        = host.IP()
+		nodeRolesByIP = map[netip.Addr][]garage.Role{}
+	)
+
+	for _, node := range status.Nodes {
+		if node.Role == nil {
+			continue
+		}
+
+		ip := node.Addr.Addr()
+		nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
+	}
+
+	// If there is only a single host in the cluster (or, somehow, none) then
+	// that host has no buddy.
+	if len(nodeRolesByIP) < 2 {
+		return netip.Addr{}, nil
+	}
+
+	nodeIPs := maps.Keys(nodeRolesByIP)
+	slices.SortFunc(nodeIPs, netip.Addr.Compare)
+
+	for i, nodeIP := range nodeIPs {
+		var buddyIP netip.Addr
+		if i == len(nodeIPs)-1 {
+			buddyIP = nodeIPs[0]
+		} else if nodeIP == thisIP {
+			buddyIP = nodeIPs[i+1]
+		} else {
+			continue
+		}
+		return buddyIP, nodeRolesByIP[buddyIP]
+	}
+
+	panic("Unreachable")
+}
diff --git a/go/daemon/network/network.go b/go/daemon/network/network.go
index d311762..5c85b48 100644
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
 }
 
 func (n *network) periodically(
-	logger *mlog.Logger,
+	label string,
 	fn func(context.Context) error,
 	period time.Duration,
 ) {
@@ -427,13 +427,13 @@
 	go func() {
 		defer n.wg.Done()
 
-		ctx := mctx.Annotate(n.workerCtx, "period", period)
+		ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)
 
 		ticker := time.NewTicker(period)
 		defer ticker.Stop()
 
-		logger.Info(ctx, "Starting background job runner")
-		defer logger.Info(ctx, "Stopping background job runner")
+		n.logger.Info(ctx, "Starting background job runner")
+		defer n.logger.Info(ctx, "Stopping background job runner")
 
 		for {
 			select {
@@ -441,9 +441,9 @@
 				return
 
 			case <-ticker.C:
-				logger.Info(ctx, "Background job running")
+				n.logger.Info(ctx, "Background job running")
 				if err := fn(ctx); err != nil {
-					logger.Error(ctx, "Background job failed", err)
+					n.logger.Error(ctx, "Background job failed", err)
 				}
 			}
 		}
@@ -510,10 +510,10 @@
 		return fmt.Errorf("Reloading network bootstrap: %w", err)
 	}
 
+	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
+
 	n.periodically(
-		n.logger.WithNamespace("reloadHosts"),
-		n.reloadHosts,
-		3*time.Minute,
+		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
 	)
 
 	return nil
@@ -531,7 +531,7 @@ func (n *network) postChildrenInit(
 
 	thisHost := n.currBootstrap.ThisHost()
 
-	if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 {
+	if len(thisHost.Garage.Instances) > 0 {
 		n.logger.Info(ctx, "Applying garage layout")
 		if err := garageApplyLayout(
 			ctx,
@@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }
 
+// In general each host will manage the garage cluster layout of its own storage
+// allocations via garageApplyLayout. There are three situations which are
+// handled here, rather than by garageApplyLayout:
+//
+// - A host removes all of its allocations via SetConfig.
+// - A host removes all of its allocations by calling Load with no allocations
+//   in the provided daecommon.NetworkConfig.
+// - A host is removed from the network by another host.
+//
+// In all of these cases the host no longer has any garage instances running,
+// and so can't call garageApplyLayout on itself. To combat this we have all
+// hosts which do have garage instances running periodically check that there
+// are no garage nodes orphaned in the cluster layout, and remove any that are.
+func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
+	n.l.RLock()
+	defer n.l.RUnlock()
+
+	thisHost := n.currBootstrap.ThisHost()
+	if len(thisHost.Garage.Instances) == 0 {
+		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
+		return nil
+	}
+
+	adminClient := newGarageAdminClient(
+		n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
+	)
+	defer adminClient.Close()
+
+	clusterStatus, err := adminClient.Status(ctx)
+	if err != nil {
+		return fmt.Errorf("retrieving garage cluster status: %w", err)
+	}
+
+	buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
+	if len(buddyNodes) == 0 {
+		return nil
+	}
+
+	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
+
+	for _, host := range n.currBootstrap.Hosts {
+		if host.IP() != buddyIP {
+			continue
+		} else if len(host.Garage.Instances) > 0 {
+			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
+			return nil
+		}
+		break
+	}
+
+	// Either the host is no longer in the network, or it no longer has any
+	// garage instances set on it. Either way, remove its nodes from the cluster
+	// layout.
+
+	buddyNodeIDs := make([]string, len(buddyNodes))
+	for i, buddyNode := range buddyNodes {
+		buddyNodeIDs[i] = buddyNode.ID
+	}
+
+	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
+	if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
+		return fmt.Errorf(
+			"applying garage cluster layout, removing nodes %+v: %w",
+			buddyNodes,
+			err,
+		)
+	}
+
+	return nil
+}
+
 // returns the bootstrap prior to the reload being applied.
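The buddy rule used above is easiest to see with concrete addresses. The following standalone sketch (not part of the patch) applies the same sort-and-wrap-around selection to a few hypothetical VPN IPs using only the standard library; the real garageNodeBuddyPeers additionally carries each node's garage roles along with its IP.

```go
package main

import (
	"fmt"
	"net/netip"
	"slices"
)

// buddyOf applies the rule described in the comment above: sort all host IPs,
// then each host is responsible for the host immediately after it in that
// order, with the last host wrapping around to the first. It reports false if
// thisIP has no buddy (fewer than two hosts, or thisIP is not in the list).
func buddyOf(thisIP netip.Addr, ips []netip.Addr) (netip.Addr, bool) {
	if len(ips) < 2 {
		return netip.Addr{}, false
	}

	sorted := slices.Clone(ips)
	slices.SortFunc(sorted, netip.Addr.Compare)

	i := slices.Index(sorted, thisIP)
	if i < 0 {
		return netip.Addr{}, false
	}

	return sorted[(i+1)%len(sorted)], true
}

func main() {
	// Hypothetical VPN addresses, not taken from the patch.
	ips := []netip.Addr{
		netip.MustParseAddr("10.89.0.4"),
		netip.MustParseAddr("10.89.0.2"),
		netip.MustParseAddr("10.89.0.3"),
	}

	for _, ip := range ips {
		buddy, _ := buddyOf(ip, ips)
		// Prints: .2 -> .3, .3 -> .4, and .4 wraps around to .2.
		fmt.Println(ip, "->", buddy)
	}
}
```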
 func (n *network) reload(
 	ctx context.Context,
diff --git a/go/daemon/network/network_it_test.go b/go/daemon/network/network_it_test.go
index 983a621..e8a12f5 100644
--- a/go/daemon/network/network_it_test.go
+++ b/go/daemon/network/network_it_test.go
@@ -54,19 +54,44 @@ func TestLoad(t *testing.T) {
 }
 
 func TestJoin(t *testing.T) {
-	var (
-		h        = newIntegrationHarness(t)
-		primus   = h.createNetwork(t, "primus", nil)
-		secondus = h.joinNetwork(t, primus, "secondus", nil)
-	)
+	t.Run("simple", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", nil)
+		)
 
-	primusHosts, err := primus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)
 
-	secondusHosts, err := secondus.GetHosts(h.ctx)
-	assert.NoError(t, err)
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)
 
-	assert.Equal(t, primusHosts, secondusHosts)
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
+
+	t.Run("with alloc", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+		)
+
+		t.Log("reloading primus' hosts")
+		assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
+
+		primusHosts, err := primus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		secondusHosts, err := secondus.GetHosts(h.ctx)
+		assert.NoError(t, err)
+
+		assert.Equal(t, primusHosts, secondusHosts)
+	})
 }
 
 func TestNetwork_GetConfig(t *testing.T) {
@@ -189,5 +214,54 @@ func TestNetwork_SetConfig(t *testing.T) {
 		assert.ElementsMatch(t, expRoles, layout.Roles)
 	})
 
-	// TODO a host having allocs but removing all of them
+	t.Run("remove all storage allocs", func(t *testing.T) {
+		var (
+			h        = newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+			networkConfig = secondus.getConfig(t)
+
+			prevHost     = secondus.getHostsByName(t)[secondus.hostName]
+			removedAlloc = networkConfig.Storage.Allocations[0]
+			removedRole  = allocsToRoles(
+				secondus.hostName, prevHost.Garage.Instances,
+			)[0]
+			removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
+				prevHost, removedAlloc,
+			)
+
+			primusGarageAdminClient = primus.garageAdminClient(t)
+		)
+
+		networkConfig.Storage.Allocations = nil
+		assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
+
+		t.Log("Checking that the Host information was updated")
+		newHostsByName := primus.getHostsByName(t)
+		newHost, ok := newHostsByName[secondus.hostName]
+		assert.True(t, ok)
+
+		allocs := newHost.HostConfigured.Garage.Instances
+		assert.Len(t, allocs, 3)
+		assert.NotContains(t, allocs, removedGarageInst)
+
+		t.Log("Checking that garage layout still contains the old allocation")
+		layout, err := primusGarageAdminClient.GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.Contains(t, layout.Roles, removedRole)
+
+		t.Log("Removing orphan garage nodes with primus")
+		assert.NoError(
+			t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
+		)
+
+		t.Log("Checking that garage layout no longer contains the old allocation")
+		layout, err = primusGarageAdminClient.GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.NotContains(t, layout.Roles, removedRole)
+	})
 }
diff --git a/go/garage/admin_client.go b/go/garage/admin_client.go
index 129394b..322483d 100644
--- a/go/garage/admin_client.go
+++ b/go/garage/admin_client.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httputil"
+	"net/netip"
 	"time"
 
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@@ -158,38 +159,81 @@ func (c *AdminClient) do(
 	return nil
 }
 
+// KnownNode describes the fields of a known node in the cluster, as returned
+// as part of [ClusterStatus].
+type KnownNode struct {
+	ID              string         `json:"id"`
+	Role            *Role          `json:"role"`
+	Addr            netip.AddrPort `json:"addr"`
+	IsUp            bool           `json:"isUp"`
+	LastSeenSecsAgo int            `json:"lastSeenSecsAgo"`
+	HostName        string         `json:"hostname"`
+}
+
+// Role describes a node's role in the garage cluster, i.e. what storage it is
+// providing.
+type Role struct {
+	ID       string   `json:"id"`
+	Capacity int      `json:"capacity"` // Gb (SI units)
+	Zone     string   `json:"zone"`
+	Tags     []string `json:"tags"`
+}
+
+// ClusterLayout describes the layout of the cluster as a whole.
+type ClusterLayout struct {
+	Version           int    `json:"version"`
+	Roles             []Role `json:"roles"`
+	StagedRoleChanges []Role `json:"stagedRoleChanges"`
+}
+
+// ClusterStatus is returned from the Status endpoint, describing the currently
+// known state of the cluster.
+type ClusterStatus struct {
+	Nodes []KnownNode `json:"nodes"`
+}
+
+// Status returns the current state of the cluster.
+func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
+	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
+	var clusterStatus ClusterStatus
+	err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
+	return clusterStatus, err
+}
+
 // Wait will block until the instance connected to can see at least
 // ReplicationFactor other garage instances. If the context is canceled it
 // will return the context error.
 func (c *AdminClient) Wait(ctx context.Context) error {
-	for first := true; ; first = false {
-		if !first {
-			time.Sleep(250 * time.Millisecond)
+	for {
+		select {
+		case <-time.After(2 * time.Second):
+		case <-ctx.Done():
+			return ctx.Err()
 		}
 
-		// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
-		var clusterStatus struct {
-			Nodes []struct {
-				IsUp bool `json:"isUp"`
-			} `json:"nodes"`
-		}
-
-		err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
-
+		c.logger.Debug(ctx, "Getting cluster status")
+		clusterStatus, err := c.Status(ctx)
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			return ctxErr
 		} else if err != nil {
-			c.logger.Warn(ctx, "waiting for instance to become ready", err)
+			ctx := mctx.Annotate(ctx, "errMsg", err.Error())
+			c.logger.Info(ctx, "Instance is not online yet")
 			continue
 		}
 
 		var numUp int
 
 		for _, node := range clusterStatus.Nodes {
-			if node.IsUp {
+			// There seems to be some kind of delay between IsUp becoming true
+			// and garage actually loading the full state of a node, so we check
+			// for the HostName as well. We could also use LastSeenSecsAgo, but
+			// that remains null for the node being queried so it's more
+			// annoying to use.
+			if node.IsUp && node.HostName != "" {
 				numUp++
 			}
 		}
@@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
 			return nil
 		}
 
-		c.logger.Debug(ctx, "instance not online yet, will continue waiting")
+		c.logger.Info(ctx, "Instance is not joined to the cluster yet")
 	}
 }
 
@@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
 	})
 }
 
-// Role descibes a node's role in the garage cluster, i.e. what storage it is
-// providing.
-type Role struct {
-	ID       string   `json:"id"`
-	Capacity int      `json:"capacity"` // Gb (SI units)
-	Zone     string   `json:"zone"`
-	Tags     []string `json:"tags"`
-}
-
-// ClusterLayout describes the layout of the cluster as a whole.
-type ClusterLayout struct {
-	Roles []Role `json:"roles"`
-}
-
 // GetLayout returns the currently applied ClusterLayout.
 func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
 	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
@@ -323,13 +353,7 @@ func (c *AdminClient) ApplyLayout(
 		roles = append(roles, removeRole{ID: id, Remove: true})
 	}
 
-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
-	var clusterLayout struct {
-		Version           int    `json:"version"`
-		StagedRoleChanges []Role `json:"stagedRoleChanges"`
-	}
-
-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
+	var clusterLayout ClusterLayout
 	err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles)
 	if err != nil {
 		return fmt.Errorf("staging layout changes: %w", err)
@@ -337,7 +361,6 @@ func (c *AdminClient) ApplyLayout(
 		return nil
 	}
 
-	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
 	applyClusterLayout := struct {
 		Version int `json:"version"`
 	}{
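For reference, here is a minimal sketch (not part of the patch) of how the admin client API added above composes outside of removeOrphanGarageNodes. The removeStaleRoles helper, its isStale predicate, and the isle/garage import path are assumptions; Status, KnownNode, and the ApplyLayout(ctx, nil, ids) call shape are taken from the patch itself.

```go
package example

import (
	"context"
	"fmt"

	"isle/garage" // assumed import path for the package patched above
)

// removeStaleRoles is a hypothetical helper showing how the new endpoints
// compose: Status lists the known nodes along with their roles, and
// ApplyLayout with no roles to add removes the given node IDs from the
// cluster layout. removeOrphanGarageNodes in network.go runs this same
// Status -> ApplyLayout sequence against its buddy's nodes.
func removeStaleRoles(
	ctx context.Context,
	client *garage.AdminClient,
	isStale func(garage.KnownNode) bool,
) error {
	status, err := client.Status(ctx)
	if err != nil {
		return fmt.Errorf("retrieving garage cluster status: %w", err)
	}

	var staleIDs []string
	for _, node := range status.Nodes {
		// Only nodes which currently have a role assigned are part of the
		// layout at all.
		if node.Role != nil && isStale(node) {
			staleIDs = append(staleIDs, node.ID)
		}
	}

	if len(staleIDs) == 0 {
		return nil
	}

	// nil is passed for the roles to add, mirroring the call made by
	// removeOrphanGarageNodes.
	return client.ApplyLayout(ctx, nil, staleIDs)
}
```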