diff --git a/go/daemon/children/children.go b/go/daemon/children/children.go
index 5642ca7..6e00c2d 100644
--- a/go/daemon/children/children.go
+++ b/go/daemon/children/children.go
@@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
 	return nil
 }
 
-// TODO this doesn't handle removing garage nodes
 func (c *Children) reloadGarage(
 	ctx context.Context,
 	networkConfig daecommon.NetworkConfig,
@@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
 		)
 	)
 
+	// TODO it's possible that the config changed, but only the bootstrap
+	// peers, in which case we don't need to restart the node.
 	childConfigPath, changed, err := garageWriteChildConfig(
 		ctx,
 		c.logger,
diff --git a/go/daemon/network/garage.go b/go/daemon/network/garage.go
index faa18ab..23c4aef 100644
--- a/go/daemon/network/garage.go
+++ b/go/daemon/network/garage.go
@@ -12,13 +12,16 @@ import (
 	"isle/nebula"
 	"isle/secrets"
 	"net"
+	"net/netip"
 	"path/filepath"
+	"slices"
 	"strconv"
 	"time"
 
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
 	"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
 	"github.com/minio/minio-go/v7"
+	"golang.org/x/exp/maps"
 )
 
 // Paths within garage's global bucket.
@@ -341,3 +344,67 @@ func garageWaitForAlloc(
 		return nil
 	}
 }
+
+// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
+// the given garage cluster status. It will return zero values if the host has
+// no buddy.
+//
+// For situations where we want one host to modify the cluster layout of
+// another host's garage nodes, we use a simple system to determine a single
+// host which is responsible. The goal is not to be 100% race-proof (garage
+// handles that), but rather to try to prevent all hosts from modifying the
+// same host's layout at the same time.
+//
+// The system is to order all hosts by their IP, and say that each host is
+// responsible for the host immediately after its own in that list; that next
+// host is its "buddy". The last host in the list is responsible for the first.
+func garageNodeBuddyPeers(
+	status garage.ClusterStatus, host bootstrap.Host,
+) (
+	netip.Addr, []garage.Role,
+) {
+	var (
+		thisIP        = host.IP()
+		rolesByID     = make(
+			map[string]garage.Role, len(status.Layout.Roles),
+		)
+		nodeRolesByIP = map[netip.Addr][]garage.Role{}
+	)
+
+	for _, role := range status.Layout.Roles {
+		rolesByID[role.ID] = role
+	}
+
+	for _, node := range status.Nodes {
+		role, ok := rolesByID[node.ID]
+		if !ok {
+			continue
+		}
+
+		ip := node.Addr.Addr()
+		nodeRolesByIP[ip] = append(nodeRolesByIP[ip], role)
+	}
+
+	// If there is only a single host in the cluster (or, somehow, none) then
+	// that host has no buddy.
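+	//
+	// Otherwise the buddy follows the sorted-by-IP ordering described above.
+	// For example (hypothetical addresses): given hosts 10.100.0.1, 10.100.0.2,
+	// and 10.100.0.3, the buddy of .1 is .2, the buddy of .2 is .3, and the
+	// buddy of .3 wraps around to .1.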
+	if len(nodeRolesByIP) < 2 {
+		return netip.Addr{}, nil
+	}
+
+	nodeIPs := maps.Keys(nodeRolesByIP)
+	slices.SortFunc(nodeIPs, netip.Addr.Compare)
+
+	for i, nodeIP := range nodeIPs {
+		var buddyIP netip.Addr
+		if i == len(nodeIPs)-1 {
+			buddyIP = nodeIPs[0]
+		} else if nodeIP == thisIP {
+			buddyIP = nodeIPs[i+1]
+		} else {
+			continue
+		}
+		return buddyIP, nodeRolesByIP[buddyIP]
+	}
+
+	panic("Unreachable")
+}
diff --git a/go/daemon/network/network.go b/go/daemon/network/network.go
index d311762..5c85b48 100644
--- a/go/daemon/network/network.go
+++ b/go/daemon/network/network.go
@@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
 }
 
 func (n *network) periodically(
-	logger *mlog.Logger,
+	label string,
 	fn func(context.Context) error,
 	period time.Duration,
 ) {
@@ -427,13 +427,13 @@ func (n *network) periodically(
 	go func() {
 		defer n.wg.Done()
 
-		ctx := mctx.Annotate(n.workerCtx, "period", period)
+		ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)
 
 		ticker := time.NewTicker(period)
 		defer ticker.Stop()
 
-		logger.Info(ctx, "Starting background job runner")
-		defer logger.Info(ctx, "Stopping background job runner")
+		n.logger.Info(ctx, "Starting background job runner")
+		defer n.logger.Info(ctx, "Stopping background job runner")
 
 		for {
 			select {
@@ -441,9 +441,9 @@ func (n *network) periodically(
 				return
 
 			case <-ticker.C:
-				logger.Info(ctx, "Background job running")
+				n.logger.Info(ctx, "Background job running")
 				if err := fn(ctx); err != nil {
-					logger.Error(ctx, "Background job failed", err)
+					n.logger.Error(ctx, "Background job failed", err)
 				}
 			}
 		}
@@ -510,10 +510,10 @@ func (n *network) initialize(
 		return fmt.Errorf("Reloading network bootstrap: %w", err)
 	}
 
+	n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
+
 	n.periodically(
-		n.logger.WithNamespace("reloadHosts"),
-		n.reloadHosts,
-		3*time.Minute,
+		"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
 	)
 
 	return nil
@@ -531,7 +531,7 @@ func (n *network) postChildrenInit(
 
 	thisHost := n.currBootstrap.ThisHost()
 
-	if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 {
+	if len(thisHost.Garage.Instances) > 0 {
 		n.logger.Info(ctx, "Applying garage layout")
 		if err := garageApplyLayout(
 			ctx,
@@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
 	return nil
 }
 
+// In general each host will manage the garage cluster layout of its own storage
+// allocations via garageApplyLayout. There are three situations which are
+// handled here, rather than by garageApplyLayout:
+//
+//   - A host removes all of its allocations via SetConfig.
+//   - A host removes all of its allocations by calling Load with no allocations
+//     in the provided daecommon.NetworkConfig.
+//   - A host is removed from the network by another host.
+//
+// In all of these cases the host no longer has any garage instances running,
+// and so can't call garageApplyLayout on itself. To handle this, all hosts
+// which do have garage instances running periodically check for garage nodes
+// orphaned in the cluster layout, and remove any that they find.
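+//
+// Each host only ever cleans up the nodes of its "buddy" (see
+// garageNodeBuddyPeers), so for any orphaned host at most one other host will
+// normally attempt the removal, rather than every host at once. The check is
+// registered in initialize and runs once per minute.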
+func (n *network) removeOrphanGarageNodes(ctx context.Context) error { + n.l.RLock() + defer n.l.RUnlock() + + thisHost := n.currBootstrap.ThisHost() + if len(thisHost.Garage.Instances) == 0 { + n.logger.Info(ctx, "No local garage instances, cannot remove orphans") + return nil + } + + adminClient := newGarageAdminClient( + n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost, + ) + defer adminClient.Close() + + clusterStatus, err := adminClient.Status(ctx) + if err != nil { + return fmt.Errorf("retrieving garage cluster status: %w", err) + } + + buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost) + if len(buddyNodes) == 0 { + return nil + } + + ctx = mctx.Annotate(ctx, "buddyIP", buddyIP) + + for _, host := range n.currBootstrap.Hosts { + if host.IP() != buddyIP { + continue + } else if len(host.Garage.Instances) > 0 { + n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing") + return nil + } + break + } + + // Either the host is no longer in the network, or it no longer has any + // garage instances set on it. Either way, remove its nodes from the cluster + // layout. + + buddyNodeIDs := make([]string, len(buddyNodes)) + for i, buddyNode := range buddyNodes { + buddyNodeIDs[i] = buddyNode.ID + } + + n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes") + if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil { + return fmt.Errorf( + "applying garage cluster layout, removing nodes %+v: %w", + buddyNodes, + err, + ) + } + + return nil +} + // returns the bootstrap prior to the reload being applied. func (n *network) reload( ctx context.Context, diff --git a/go/daemon/network/network_it_test.go b/go/daemon/network/network_it_test.go index 983a621..104c03b 100644 --- a/go/daemon/network/network_it_test.go +++ b/go/daemon/network/network_it_test.go @@ -54,19 +54,44 @@ func TestLoad(t *testing.T) { } func TestJoin(t *testing.T) { - var ( - h = newIntegrationHarness(t) - primus = h.createNetwork(t, "primus", nil) - secondus = h.joinNetwork(t, primus, "secondus", nil) - ) + t.Run("simple", func(t *testing.T) { + var ( + h = newIntegrationHarness(t) + primus = h.createNetwork(t, "primus", nil) + secondus = h.joinNetwork(t, primus, "secondus", nil) + ) - primusHosts, err := primus.GetHosts(h.ctx) - assert.NoError(t, err) + primusHosts, err := primus.GetHosts(h.ctx) + assert.NoError(t, err) - secondusHosts, err := secondus.GetHosts(h.ctx) - assert.NoError(t, err) + secondusHosts, err := secondus.GetHosts(h.ctx) + assert.NoError(t, err) - assert.Equal(t, primusHosts, secondusHosts) + assert.Equal(t, primusHosts, secondusHosts) + }) + + t.Run("with alloc", func(t *testing.T) { + var ( + h = newIntegrationHarness(t) + primus = h.createNetwork(t, "primus", nil) + secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{ + networkConfigOpts: &networkConfigOpts{ + numStorageAllocs: 1, + }, + }) + ) + + t.Log("reloading primus' hosts") + assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx)) + + primusHosts, err := primus.GetHosts(h.ctx) + assert.NoError(t, err) + + secondusHosts, err := secondus.GetHosts(h.ctx) + assert.NoError(t, err) + + assert.Equal(t, primusHosts, secondusHosts) + }) } func TestNetwork_GetConfig(t *testing.T) { @@ -189,5 +214,33 @@ func TestNetwork_SetConfig(t *testing.T) { assert.ElementsMatch(t, expRoles, layout.Roles) }) - // TODO a host having allocs but removing all of them + t.Run("remove all storage allocs", func(t *testing.T) { + var ( + h = 
newIntegrationHarness(t)
+			primus   = h.createNetwork(t, "primus", nil)
+			secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
+				networkConfigOpts: &networkConfigOpts{
+					numStorageAllocs: 1,
+				},
+			})
+			networkConfig = secondus.getConfig(t)
+
+			prevHost = secondus.getHostsByName(t)[secondus.hostName]
+			//removedAlloc = networkConfig.Storage.Allocations[0]
+			removedRole = allocsToRoles(
+				secondus.hostName, prevHost.Garage.Instances,
+			)[0]
+			//removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
+			//	prevHost, removedAlloc,
+			//)
+		)
+
+		networkConfig.Storage.Allocations = nil
+		assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
+
+		t.Log("Checking that garage layout still contains the old allocation")
+		layout, err := secondus.garageAdminClient(t).GetLayout(h.ctx)
+		assert.NoError(t, err)
+		assert.Contains(t, layout.Roles, removedRole)
+	})
 }
diff --git a/go/garage/admin_client.go b/go/garage/admin_client.go
index 129394b..259b4d0 100644
--- a/go/garage/admin_client.go
+++ b/go/garage/admin_client.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httputil"
+	"net/netip"
 	"time"
 
 	"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@@ -158,6 +159,45 @@ func (c *AdminClient) do(
 	return nil
 }
 
+// KnownNode describes the fields of a known node in the cluster, as returned
+// as part of [ClusterStatus].
+type KnownNode struct {
+	ID              string         `json:"id"`
+	Addr            netip.AddrPort `json:"addr"`
+	IsUp            bool           `json:"isUp"`
+	LastSeenSecsAgo int            `json:"lastSeenSecsAgo"`
+	HostName        string         `json:"hostname"`
+}
+
+// Role describes a node's role in the garage cluster, i.e. what storage it is
+// providing.
+type Role struct {
+	ID       string   `json:"id"`
+	Capacity int      `json:"capacity"` // Gb (SI units)
+	Zone     string   `json:"zone"`
+	Tags     []string `json:"tags"`
+}
+
+// ClusterLayout describes the layout of the cluster as a whole.
+type ClusterLayout struct {
+	Roles []Role `json:"roles"`
+}
+
+// ClusterStatus is returned from the Status endpoint, describing the currently
+// known state of the cluster.
+type ClusterStatus struct {
+	Nodes  []KnownNode   `json:"nodes"`
+	Layout ClusterLayout `json:"layout"`
+}
+
+// Status returns the current state of the cluster.
+func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
+	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
+	var clusterStatus ClusterStatus
+	err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
+	return clusterStatus, err
+}
+
 // Wait will block until the instance connected to can see at least
 // ReplicationFactor other garage instances. If the context is canceled it
 // will return the context error.
@@ -166,30 +206,32 @@ func (c *AdminClient) Wait(ctx context.Context) error { for first := true; ; first = false { if !first { - time.Sleep(250 * time.Millisecond) + select { + case <-time.After(500 * time.Millisecond): + case <-ctx.Done(): + return ctx.Err() + } } - // https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes - var clusterStatus struct { - Nodes []struct { - IsUp bool `json:"isUp"` - } `json:"nodes"` - } - - err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil) - + c.logger.Debug(ctx, "Getting cluster status") + clusterStatus, err := c.Status(ctx) if ctxErr := ctx.Err(); ctxErr != nil { return ctxErr } else if err != nil { - c.logger.Warn(ctx, "waiting for instance to become ready", err) + c.logger.Warn(ctx, "Instance is not yet ready", err) continue } var numUp int for _, node := range clusterStatus.Nodes { - if node.IsUp { + // There seems to be some kind of step between IsUp becoming true + // and garage actually loading the full state of a node, so we check + // for the HostName as well. We could also use LastSeenSecsAgo, but + // that remains null for the node being queried so it's more + // annoying to use. + if node.IsUp && node.HostName != "" { numUp++ } } @@ -204,7 +246,7 @@ func (c *AdminClient) Wait(ctx context.Context) error { return nil } - c.logger.Debug(ctx, "instance not online yet, will continue waiting") + c.logger.Debug(ctx, "Instance not online yet, will continue waiting") } } @@ -283,20 +325,6 @@ func (c *AdminClient) GrantBucketPermissions( }) } -// Role descibes a node's role in the garage cluster, i.e. what storage it is -// providing. -type Role struct { - ID string `json:"id"` - Capacity int `json:"capacity"` // Gb (SI units) - Zone string `json:"zone"` - Tags []string `json:"tags"` -} - -// ClusterLayout describes the layout of the cluster as a whole. -type ClusterLayout struct { - Roles []Role `json:"roles"` -} - // GetLayout returns the currently applied ClusterLayout. func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) { // https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout