Have hosts update garage cluster layout to remove other nodes if necessary

This commit is contained in:
Brian Picciano 2024-11-05 22:31:57 +01:00
parent 53a06af9ba
commit 1f5286dbb4
6 changed files with 306 additions and 62 deletions

View File

@ -1,8 +1,8 @@
# Contributing Storage
If your host machine can be reasonably sure of being online most, if not all, of
the time, and has 1GB or more of unused drive space you'd like to contribute to
the network, then this document is for you.
This document is for you if your host machine can be reliably be online at all
times and has 1GB or more of unused drive space you'd like to contribute to the
network.
## Edit `daemon.yml`
@ -16,7 +16,7 @@ one allocation listed.
The comments in the file should be self-explanatory, but ask your admin if you
need any clarification.
Here are an example set of allocations for a host which is contributing space
Here is an example set of allocations for a host which is contributing space
from two separate drives:
```
@ -47,6 +47,17 @@ process.
The `isle daemon` will automatically allow the ports used for your
storage allocations in the vpn firewall.
## Removing Allocations
If you later decide to no longer provide storage simply remove the
`storage.allocations` item from your `/etc/isle/daemon.yml` file and restart the
`isle daemon` process.
Once removed, it is advisable to wait some time before turning off the daemon or
deleting the data from the data and meta directories which were previously
allocated to. This gives other hosts in the network time to offload all data
that yours was previously hosting.
## Further Reading
Isle uses the [garage][garage] project for its storage system. See the

View File

@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
return nil
}
// TODO this doesn't handle removing garage nodes
func (c *Children) reloadGarage(
ctx context.Context,
networkConfig daecommon.NetworkConfig,
@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
)
)
// TODO it's possible that the config changed, but only the bootstrap
// peers, in which case we don't need to restart the node.
childConfigPath, changed, err := garageWriteChildConfig(
ctx,
c.logger,

View File

@ -12,13 +12,16 @@ import (
"isle/nebula"
"isle/secrets"
"net"
"net/netip"
"path/filepath"
"slices"
"strconv"
"time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
"github.com/minio/minio-go/v7"
"golang.org/x/exp/maps"
)
// Paths within garage's global bucket.
@ -341,3 +344,59 @@ func garageWaitForAlloc(
return nil
}
}
// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
// the given garage cluster status. It will return zero values if the host has
// no buddy.
//
// For situations where we want one host to affect the cluster layout of another
// host's peers, we use a simple system to determine a single host which is
// responsible. The goal is not to be 100% race-proof (garage handles that), but
// rather to try to prevent all hosts from modifying the same host's layout at
// the same time.
//
// The system is to order all hosts by their IP, and say that each host is
// responsible for (aka the "buddy" of) the host immediately after their own in
// that list. The last host in that list is responsible for the first.
func garageNodeBuddyPeers(
status garage.ClusterStatus, host bootstrap.Host,
) (
netip.Addr, []garage.Role,
) {
var (
thisIP = host.IP()
nodeRolesByIP = map[netip.Addr][]garage.Role{}
)
for _, node := range status.Nodes {
if node.Role == nil {
continue
}
ip := node.Addr.Addr()
nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
}
// If there is only a single host in the cluster (or, somehow, none) then
// that host has no buddy.
if len(nodeRolesByIP) < 2 {
return netip.Addr{}, nil
}
nodeIPs := maps.Keys(nodeRolesByIP)
slices.SortFunc(nodeIPs, netip.Addr.Compare)
for i, nodeIP := range nodeIPs {
var buddyIP netip.Addr
if i == len(nodeIPs)-1 {
buddyIP = nodeIPs[0]
} else if nodeIP == thisIP {
buddyIP = nodeIPs[i+1]
} else {
continue
}
return buddyIP, nodeRolesByIP[buddyIP]
}
panic("Unreachable")
}

View File

@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
}
func (n *network) periodically(
logger *mlog.Logger,
label string,
fn func(context.Context) error,
period time.Duration,
) {
@ -427,13 +427,13 @@ func (n *network) periodically(
go func() {
defer n.wg.Done()
ctx := mctx.Annotate(n.workerCtx, "period", period)
ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)
ticker := time.NewTicker(period)
defer ticker.Stop()
logger.Info(ctx, "Starting background job runner")
defer logger.Info(ctx, "Stopping background job runner")
n.logger.Info(ctx, "Starting background job runner")
defer n.logger.Info(ctx, "Stopping background job runner")
for {
select {
@ -441,9 +441,9 @@ func (n *network) periodically(
return
case <-ticker.C:
logger.Info(ctx, "Background job running")
n.logger.Info(ctx, "Background job running")
if err := fn(ctx); err != nil {
logger.Error(ctx, "Background job failed", err)
n.logger.Error(ctx, "Background job failed", err)
}
}
}
@ -510,10 +510,10 @@ func (n *network) initialize(
return fmt.Errorf("Reloading network bootstrap: %w", err)
}
n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
n.periodically(
n.logger.WithNamespace("reloadHosts"),
n.reloadHosts,
3*time.Minute,
"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
)
return nil
@ -531,7 +531,7 @@ func (n *network) postChildrenInit(
thisHost := n.currBootstrap.ThisHost()
if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 {
if len(thisHost.Garage.Instances) > 0 {
n.logger.Info(ctx, "Applying garage layout")
if err := garageApplyLayout(
ctx,
@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
return nil
}
// In general each host will manage the garage cluster layout of its own storage
// allocations via garageApplyLayout. There are three situations which are
// handled here, rather than garageApplyLayout:
//
// - A host removes all of its allocations via SetConfig.
// - A host removes all of its allocations by calling Load with no allocations
// in the provided daecommon.NetworkConfig.
// - A host is removed from the network by another host.
//
// In all of these cases the host no longer has any garage instances running,
// and so can't call garageApplyLayout on itself. To combat this we have all
// hosts which do have garage instances running periodically check that there's
// not some garage nodes orphaned in the cluster layout, and remove them if so.
func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
n.l.RLock()
defer n.l.RUnlock()
thisHost := n.currBootstrap.ThisHost()
if len(thisHost.Garage.Instances) == 0 {
n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
return nil
}
adminClient := newGarageAdminClient(
n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
)
defer adminClient.Close()
clusterStatus, err := adminClient.Status(ctx)
if err != nil {
return fmt.Errorf("retrieving garage cluster status: %w", err)
}
buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
if len(buddyNodes) == 0 {
return nil
}
ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
for _, host := range n.currBootstrap.Hosts {
if host.IP() != buddyIP {
continue
} else if len(host.Garage.Instances) > 0 {
n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
return nil
}
break
}
// Either the host is no longer in the network, or it no longer has any
// garage instances set on it. Either way, remove its nodes from the cluster
// layout.
buddyNodeIDs := make([]string, len(buddyNodes))
for i, buddyNode := range buddyNodes {
buddyNodeIDs[i] = buddyNode.ID
}
n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
return fmt.Errorf(
"applying garage cluster layout, removing nodes %+v: %w",
buddyNodes,
err,
)
}
return nil
}
// returns the bootstrap prior to the reload being applied.
func (n *network) reload(
ctx context.Context,

View File

@ -54,19 +54,44 @@ func TestLoad(t *testing.T) {
}
func TestJoin(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", nil)
)
t.Run("simple", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", nil)
)
primusHosts, err := primus.GetHosts(h.ctx)
assert.NoError(t, err)
primusHosts, err := primus.GetHosts(h.ctx)
assert.NoError(t, err)
secondusHosts, err := secondus.GetHosts(h.ctx)
assert.NoError(t, err)
secondusHosts, err := secondus.GetHosts(h.ctx)
assert.NoError(t, err)
assert.Equal(t, primusHosts, secondusHosts)
assert.Equal(t, primusHosts, secondusHosts)
})
t.Run("with alloc", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
})
)
t.Log("reloading primus' hosts")
assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
primusHosts, err := primus.GetHosts(h.ctx)
assert.NoError(t, err)
secondusHosts, err := secondus.GetHosts(h.ctx)
assert.NoError(t, err)
assert.Equal(t, primusHosts, secondusHosts)
})
}
func TestNetwork_GetConfig(t *testing.T) {
@ -189,5 +214,59 @@ func TestNetwork_SetConfig(t *testing.T) {
assert.ElementsMatch(t, expRoles, layout.Roles)
})
// TODO a host having allocs but removing all of them
t.Run("remove all storage allocs", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
})
networkConfig = secondus.getConfig(t)
prevHost = secondus.getHostsByName(t)[secondus.hostName]
removedAlloc = networkConfig.Storage.Allocations[0]
removedRole = allocsToRoles(
secondus.hostName, prevHost.Garage.Instances,
)[0]
removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
prevHost, removedAlloc,
)
primusGarageAdminClient = primus.garageAdminClient(t)
)
networkConfig.Storage.Allocations = nil
assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
t.Log("Checking that the Host information was updated")
newHostsByName := primus.getHostsByName(t)
newHost, ok := newHostsByName[secondus.hostName]
assert.True(t, ok)
allocs := newHost.HostConfigured.Garage.Instances
assert.Len(t, allocs, 3)
assert.NotContains(t, allocs, removedGarageInst)
t.Log("Checking that garage layout still contains the old allocation")
layout, err := primusGarageAdminClient.GetLayout(h.ctx)
assert.NoError(t, err)
assert.Contains(t, layout.Roles, removedRole)
t.Log("Removing orphan garage nodes with primus")
assert.NoError(
t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
)
t.Log("Checking that garage layout no longer contains the old allocation")
layout, err = primusGarageAdminClient.GetLayout(h.ctx)
assert.NoError(t, err)
assert.NotContains(t, layout.Roles, removedRole)
})
// TODO make sure that if two nodes each have 3, and one removes all 3, that
// the other is able to retain availability of all data. This means that the
// garage nodes on the node which removed all allocs need to stay online
// some time, in order to replicate data to the leftover nodes.
}

View File

@ -8,6 +8,7 @@ import (
"io"
"net/http"
"net/http/httputil"
"net/netip"
"time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@ -158,38 +159,81 @@ func (c *AdminClient) do(
return nil
}
// KnownNode describes the fields of a known node in the cluster, as returned
// as part of [ClusterStatus].
type KnownNode struct {
ID string `json:"id"`
Role *Role `json:"role"`
Addr netip.AddrPort `json:"addr"`
IsUp bool `json:"isUp"`
LastSeenSecsAgo int `json:"lastSeenSecsAgo"`
HostName string `json:"hostname"`
}
// Role descibes a node's role in the garage cluster, i.e. what storage it is
// providing.
type Role struct {
ID string `json:"id"`
Capacity int `json:"capacity"` // Gb (SI units)
Zone string `json:"zone"`
Tags []string `json:"tags"`
}
// ClusterLayout describes the layout of the cluster as a whole.
type ClusterLayout struct {
Version int `json:"version"`
Roles []Role `json:"roles"`
StagedRoleChanges []Role `json:"stagedRoleChanges"`
}
// ClusterStatus is returned from the Status endpoint, describing the currently
// known state of the cluster.
type ClusterStatus struct {
Nodes []KnownNode `json:"nodes"`
}
// Status returns the current state of the cluster.
func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
var clusterStatus ClusterStatus
err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
return clusterStatus, err
}
// Wait will block until the instance connected to can see at least
// ReplicationFactor other garage instances. If the context is canceled it
// will return the context error.
func (c *AdminClient) Wait(ctx context.Context) error {
for first := true; ; first = false {
if !first {
time.Sleep(250 * time.Millisecond)
select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
return ctx.Err()
}
}
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
var clusterStatus struct {
Nodes []struct {
IsUp bool `json:"isUp"`
} `json:"nodes"`
}
err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
c.logger.Debug(ctx, "Getting cluster status")
clusterStatus, err := c.Status(ctx)
if ctxErr := ctx.Err(); ctxErr != nil {
return ctxErr
} else if err != nil {
c.logger.Warn(ctx, "waiting for instance to become ready", err)
ctx := mctx.Annotate(ctx, "errMsg", err.Error())
c.logger.Info(ctx, "Instance is not online yet")
continue
}
var numUp int
for _, node := range clusterStatus.Nodes {
if node.IsUp {
// There seems to be some kind of step between IsUp becoming true
// and garage actually loading the full state of a node, so we check
// for the HostName as well. We could also use LastSeenSecsAgo, but
// that remains null for the node being queried so it's more
// annoying to use.
if node.IsUp && node.HostName != "" {
numUp++
}
}
@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
return nil
}
c.logger.Debug(ctx, "instance not online yet, will continue waiting")
c.logger.Info(ctx, "Instance is not joined to the cluster yet")
}
}
@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
})
}
// Role descibes a node's role in the garage cluster, i.e. what storage it is
// providing.
type Role struct {
ID string `json:"id"`
Capacity int `json:"capacity"` // Gb (SI units)
Zone string `json:"zone"`
Tags []string `json:"tags"`
}
// ClusterLayout describes the layout of the cluster as a whole.
type ClusterLayout struct {
Roles []Role `json:"roles"`
}
// GetLayout returns the currently applied ClusterLayout.
func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
@ -323,13 +353,7 @@ func (c *AdminClient) ApplyLayout(
roles = append(roles, removeRole{ID: id, Remove: true})
}
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
var clusterLayout struct {
Version int `json:"version"`
StagedRoleChanges []Role `json:"stagedRoleChanges"`
}
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
var clusterLayout ClusterLayout
err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles)
if err != nil {
return fmt.Errorf("staging layout changes: %w", err)
@ -337,7 +361,6 @@ func (c *AdminClient) ApplyLayout(
return nil
}
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
applyClusterLayout := struct {
Version int `json:"version"`
}{