Have hosts update garage cluster layout to remove other nodes if necessary

This commit is contained in:
Brian Picciano 2024-11-05 22:31:57 +01:00
parent 53a06af9ba
commit 1f5286dbb4
6 changed files with 306 additions and 62 deletions

View File

@ -1,8 +1,8 @@
# Contributing Storage # Contributing Storage
If your host machine can be reasonably sure of being online most, if not all, of This document is for you if your host machine can reliably be online at all
the time, and has 1GB or more of unused drive space you'd like to contribute to times and has 1GB or more of unused drive space you'd like to contribute to the
the network, then this document is for you. network.
## Edit `daemon.yml` ## Edit `daemon.yml`
@ -16,7 +16,7 @@ one allocation listed.
The comments in the file should be self-explanatory, but ask your admin if you The comments in the file should be self-explanatory, but ask your admin if you
need any clarification. need any clarification.
Here are an example set of allocations for a host which is contributing space Here is an example set of allocations for a host which is contributing space
from two separate drives: from two separate drives:
``` ```
@ -47,6 +47,17 @@ process.
The `isle daemon` will automatically allow the ports used for your The `isle daemon` will automatically allow the ports used for your
storage allocations in the vpn firewall. storage allocations in the vpn firewall.
## Removing Allocations
If you later decide to no longer provide storage, simply remove the
`storage.allocations` item from your `/etc/isle/daemon.yml` file and restart the
`isle daemon` process.
Once removed, it is advisable to wait some time before turning off the daemon or
deleting the contents of the data and meta directories which were previously
allocated. This gives other hosts in the network time to offload all data
that yours was previously hosting.
## Further Reading ## Further Reading
Isle uses the [garage][garage] project for its storage system. See the Isle uses the [garage][garage] project for its storage system. See the

View File

@ -184,7 +184,6 @@ func (c *Children) reloadNebula(
return nil return nil
} }
// TODO this doesn't handle removing garage nodes
func (c *Children) reloadGarage( func (c *Children) reloadGarage(
ctx context.Context, ctx context.Context,
networkConfig daecommon.NetworkConfig, networkConfig daecommon.NetworkConfig,
@ -206,6 +205,8 @@ func (c *Children) reloadGarage(
) )
) )
// TODO it's possible that the config changed, but only the bootstrap
// peers, in which case we don't need to restart the node.
childConfigPath, changed, err := garageWriteChildConfig( childConfigPath, changed, err := garageWriteChildConfig(
ctx, ctx,
c.logger, c.logger,

View File

@ -12,13 +12,16 @@ import (
"isle/nebula" "isle/nebula"
"isle/secrets" "isle/secrets"
"net" "net"
"net/netip"
"path/filepath" "path/filepath"
"slices"
"strconv" "strconv"
"time" "time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx" "dev.mediocregopher.com/mediocre-go-lib.git/mctx"
"dev.mediocregopher.com/mediocre-go-lib.git/mlog" "dev.mediocregopher.com/mediocre-go-lib.git/mlog"
"github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7"
"golang.org/x/exp/maps"
) )
// Paths within garage's global bucket. // Paths within garage's global bucket.
@ -341,3 +344,59 @@ func garageWaitForAlloc(
return nil return nil
} }
} }
// garageNodeBuddyPeers returns the IP of the given host's "buddy", together
// with the roles of every garage node found at that IP in the given cluster
// status. It returns zero values if the host has no buddy.
//
// For situations where we want one host to affect the cluster layout of another
// host's peers, we use a simple system to determine a single host which is
// responsible. The goal is not to be 100% race-proof (garage handles that), but
// rather to try to prevent all hosts from modifying the same host's layout at
// the same time.
//
// The system is to order all hosts by their IP, and say that each host is
// responsible for (aka the "buddy" of) the host immediately after their own in
// that list. The last host in that list is responsible for the first.
//
// Only nodes which have a role assigned are considered. A host has no buddy
// when fewer than two distinct IPs have role-bearing nodes, or when the host's
// own IP is not among them; in the latter case the host has no position in the
// ordering and so is not responsible for anyone.
func garageNodeBuddyPeers(
	status garage.ClusterStatus, host bootstrap.Host,
) (
	netip.Addr, []garage.Role,
) {
	// Group the roles of all role-bearing nodes by the IP they are reachable
	// on; a single host may run multiple garage nodes.
	nodeRolesByIP := map[netip.Addr][]garage.Role{}
	for _, node := range status.Nodes {
		if node.Role == nil {
			continue
		}

		ip := node.Addr.Addr()
		nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
	}

	// If there is only a single host in the cluster (or, somehow, none) then
	// that host has no buddy.
	if len(nodeRolesByIP) < 2 {
		return netip.Addr{}, nil
	}

	nodeIPs := maps.Keys(nodeRolesByIP)
	slices.SortFunc(nodeIPs, netip.Addr.Compare)

	// A host which isn't part of the ordering must not claim a buddy,
	// otherwise every such host would consider itself responsible for the
	// first host in the list.
	thisIdx := slices.Index(nodeIPs, host.IP())
	if thisIdx < 0 {
		return netip.Addr{}, nil
	}

	// The modulo wraps the last host around to the first.
	buddyIP := nodeIPs[(thisIdx+1)%len(nodeIPs)]
	return buddyIP, nodeRolesByIP[buddyIP]
}

View File

@ -419,7 +419,7 @@ func (n *network) initializeDirs(mayExist bool) error {
} }
func (n *network) periodically( func (n *network) periodically(
logger *mlog.Logger, label string,
fn func(context.Context) error, fn func(context.Context) error,
period time.Duration, period time.Duration,
) { ) {
@ -427,13 +427,13 @@ func (n *network) periodically(
go func() { go func() {
defer n.wg.Done() defer n.wg.Done()
ctx := mctx.Annotate(n.workerCtx, "period", period) ctx := mctx.Annotate(n.workerCtx, "workerLabel", label)
ticker := time.NewTicker(period) ticker := time.NewTicker(period)
defer ticker.Stop() defer ticker.Stop()
logger.Info(ctx, "Starting background job runner") n.logger.Info(ctx, "Starting background job runner")
defer logger.Info(ctx, "Stopping background job runner") defer n.logger.Info(ctx, "Stopping background job runner")
for { for {
select { select {
@ -441,9 +441,9 @@ func (n *network) periodically(
return return
case <-ticker.C: case <-ticker.C:
logger.Info(ctx, "Background job running") n.logger.Info(ctx, "Background job running")
if err := fn(ctx); err != nil { if err := fn(ctx); err != nil {
logger.Error(ctx, "Background job failed", err) n.logger.Error(ctx, "Background job failed", err)
} }
} }
} }
@ -510,10 +510,10 @@ func (n *network) initialize(
return fmt.Errorf("Reloading network bootstrap: %w", err) return fmt.Errorf("Reloading network bootstrap: %w", err)
} }
n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
n.periodically( n.periodically(
n.logger.WithNamespace("reloadHosts"), "removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
n.reloadHosts,
3*time.Minute,
) )
return nil return nil
@ -531,7 +531,7 @@ func (n *network) postChildrenInit(
thisHost := n.currBootstrap.ThisHost() thisHost := n.currBootstrap.ThisHost()
if len(prevThisHost.Garage.Instances)+len(thisHost.Garage.Instances) > 0 { if len(thisHost.Garage.Instances) > 0 {
n.logger.Info(ctx, "Applying garage layout") n.logger.Info(ctx, "Applying garage layout")
if err := garageApplyLayout( if err := garageApplyLayout(
ctx, ctx,
@ -618,6 +618,77 @@ func (n *network) reloadHosts(ctx context.Context) error {
return nil return nil
} }
// removeOrphanGarageNodes removes from the garage cluster layout any nodes
// belonging to this host's "buddy" (see garageNodeBuddyPeers) if that buddy no
// longer has garage instances in the current bootstrap, or is no longer in the
// bootstrap at all.
//
// Normally each host manages the cluster layout of its own storage allocations
// via garageApplyLayout. That is not possible in these three situations:
//
//   - A host removes all of its allocations via SetConfig.
//   - A host removes all of its allocations by calling Load with no allocations
//     in the provided daecommon.NetworkConfig.
//   - A host is removed from the network by another host.
//
// In each of them the host has no garage instance left running with which to
// call garageApplyLayout on itself. Hosts which do still run garage instances
// therefore periodically call this method to clean up after them. A host
// without local garage instances does nothing here.
func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
	n.l.RLock()
	defer n.l.RUnlock()

	self := n.currBootstrap.ThisHost()
	if len(self.Garage.Instances) == 0 {
		n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
		return nil
	}

	client := newGarageAdminClient(
		n.logger, n.networkConfig, n.opts.GarageAdminToken, self,
	)
	defer client.Close()

	status, err := client.Status(ctx)
	if err != nil {
		return fmt.Errorf("retrieving garage cluster status: %w", err)
	}

	buddyIP, buddyNodes := garageNodeBuddyPeers(status, self)
	if len(buddyNodes) == 0 {
		// No buddy, so there is nothing this host is responsible for.
		return nil
	}

	ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)

	// Look the buddy up in the bootstrap. If it is present and still has
	// garage instances configured then its nodes are not orphans.
	for _, candidate := range n.currBootstrap.Hosts {
		if candidate.IP() != buddyIP {
			continue
		}

		if len(candidate.Garage.Instances) > 0 {
			n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
			return nil
		}

		break
	}

	// At this point the buddy is either gone from the network or has no garage
	// instances set on it anymore, so all of its nodes get removed from the
	// cluster layout.
	orphanIDs := make([]string, 0, len(buddyNodes))
	for _, orphan := range buddyNodes {
		orphanIDs = append(orphanIDs, orphan.ID)
	}

	n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
	if err := client.ApplyLayout(ctx, nil, orphanIDs); err != nil {
		return fmt.Errorf(
			"applying garage cluster layout, removing nodes %+v: %w",
			buddyNodes,
			err,
		)
	}

	return nil
}
// returns the bootstrap prior to the reload being applied. // returns the bootstrap prior to the reload being applied.
func (n *network) reload( func (n *network) reload(
ctx context.Context, ctx context.Context,

View File

@ -54,6 +54,7 @@ func TestLoad(t *testing.T) {
} }
func TestJoin(t *testing.T) { func TestJoin(t *testing.T) {
t.Run("simple", func(t *testing.T) {
var ( var (
h = newIntegrationHarness(t) h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil) primus = h.createNetwork(t, "primus", nil)
@ -67,6 +68,30 @@ func TestJoin(t *testing.T) {
assert.NoError(t, err) assert.NoError(t, err)
assert.Equal(t, primusHosts, secondusHosts) assert.Equal(t, primusHosts, secondusHosts)
})
t.Run("with alloc", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
})
)
t.Log("reloading primus' hosts")
assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
primusHosts, err := primus.GetHosts(h.ctx)
assert.NoError(t, err)
secondusHosts, err := secondus.GetHosts(h.ctx)
assert.NoError(t, err)
assert.Equal(t, primusHosts, secondusHosts)
})
} }
func TestNetwork_GetConfig(t *testing.T) { func TestNetwork_GetConfig(t *testing.T) {
@ -189,5 +214,59 @@ func TestNetwork_SetConfig(t *testing.T) {
assert.ElementsMatch(t, expRoles, layout.Roles) assert.ElementsMatch(t, expRoles, layout.Roles)
}) })
// TODO a host having allocs but removing all of them t.Run("remove all storage allocs", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
secondus = h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
})
networkConfig = secondus.getConfig(t)
prevHost = secondus.getHostsByName(t)[secondus.hostName]
removedAlloc = networkConfig.Storage.Allocations[0]
removedRole = allocsToRoles(
secondus.hostName, prevHost.Garage.Instances,
)[0]
removedGarageInst = daecommon.BootstrapGarageHostForAlloc(
prevHost, removedAlloc,
)
primusGarageAdminClient = primus.garageAdminClient(t)
)
networkConfig.Storage.Allocations = nil
assert.NoError(t, secondus.SetConfig(h.ctx, networkConfig))
t.Log("Checking that the Host information was updated")
newHostsByName := primus.getHostsByName(t)
newHost, ok := newHostsByName[secondus.hostName]
assert.True(t, ok)
allocs := newHost.HostConfigured.Garage.Instances
assert.Len(t, allocs, 3)
assert.NotContains(t, allocs, removedGarageInst)
t.Log("Checking that garage layout still contains the old allocation")
layout, err := primusGarageAdminClient.GetLayout(h.ctx)
assert.NoError(t, err)
assert.Contains(t, layout.Roles, removedRole)
t.Log("Removing orphan garage nodes with primus")
assert.NoError(
t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
)
t.Log("Checking that garage layout no longer contains the old allocation")
layout, err = primusGarageAdminClient.GetLayout(h.ctx)
assert.NoError(t, err)
assert.NotContains(t, layout.Roles, removedRole)
})
// TODO make sure that if two nodes each have 3, and one removes all 3, that
// the other is able to retain availability of all data. This means that the
// garage nodes on the node which removed all allocs need to stay online
// some time, in order to replicate data to the leftover nodes.
} }

View File

@ -8,6 +8,7 @@ import (
"io" "io"
"net/http" "net/http"
"net/http/httputil" "net/http/httputil"
"net/netip"
"time" "time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx" "dev.mediocregopher.com/mediocre-go-lib.git/mctx"
@ -158,38 +159,81 @@ func (c *AdminClient) do(
return nil return nil
} }
// KnownNode describes the fields of a known node in the cluster, as returned
// as part of [ClusterStatus].
type KnownNode struct {
	// ID is decoded from the "id" field of the node entry.
	ID string `json:"id"`

	// Role is the role assigned to the node. It is a pointer so that a node
	// without a role can be told apart from one with an empty role; callers
	// must check for nil before dereferencing.
	Role *Role `json:"role"`

	// Addr is the address and port of the node.
	Addr netip.AddrPort `json:"addr"`

	// IsUp is decoded from the "isUp" field. NOTE(review): on its own this
	// appears not to guarantee that the node's full state has been loaded;
	// Wait additionally requires HostName to be set.
	IsUp bool `json:"isUp"`

	// LastSeenSecsAgo is decoded from the "lastSeenSecsAgo" field.
	// NOTE(review): per the comments in Wait this is null in the response for
	// the node being queried, which would decode to zero here.
	LastSeenSecsAgo int `json:"lastSeenSecsAgo"`

	// HostName is decoded from the "hostname" field. It may be empty.
	HostName string `json:"hostname"`
}
// Role describes a node's role in the garage cluster, i.e. what storage it is
// providing.
type Role struct {
	// ID identifies the node which the role belongs to.
	ID string `json:"id"`

	Capacity int `json:"capacity"` // Gb (SI units)

	// Zone is decoded from the "zone" field of the role.
	Zone string `json:"zone"`

	// Tags is decoded from the "tags" field of the role.
	Tags []string `json:"tags"`
}
// ClusterLayout describes the layout of the cluster as a whole.
type ClusterLayout struct {
	// Version is decoded from the "version" field of the layout.
	Version int `json:"version"`

	// Roles is decoded from the "roles" field of the layout.
	Roles []Role `json:"roles"`

	// StagedRoleChanges is decoded from the "stagedRoleChanges" field of the
	// layout.
	StagedRoleChanges []Role `json:"stagedRoleChanges"`
}
// ClusterStatus is returned from the Status endpoint, describing the currently
// known state of the cluster.
type ClusterStatus struct {
	// Nodes holds one entry per node listed in the status response.
	Nodes []KnownNode `json:"nodes"`
}
// Status queries the instance's status endpoint and returns the state of the
// cluster as decoded from the response. Any error from the request is returned
// as-is, alongside whatever was decoded.
func (c *AdminClient) Status(ctx context.Context) (ClusterStatus, error) {
	var status ClusterStatus

	// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes
	if err := c.do(ctx, &status, "GET", "/v1/status", nil); err != nil {
		return status, err
	}

	return status, nil
}
// Wait will block until the instance connected to can see at least // Wait will block until the instance connected to can see at least
// ReplicationFactor other garage instances. If the context is canceled it // ReplicationFactor other garage instances. If the context is canceled it
// will return the context error. // will return the context error.
func (c *AdminClient) Wait(ctx context.Context) error { func (c *AdminClient) Wait(ctx context.Context) error {
for first := true; ; first = false { for first := true; ; first = false {
if !first { if !first {
time.Sleep(250 * time.Millisecond) select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
return ctx.Err()
}
} }
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Nodes/operation/GetNodes c.logger.Debug(ctx, "Getting cluster status")
var clusterStatus struct { clusterStatus, err := c.Status(ctx)
Nodes []struct {
IsUp bool `json:"isUp"`
} `json:"nodes"`
}
err := c.do(ctx, &clusterStatus, "GET", "/v1/status", nil)
if ctxErr := ctx.Err(); ctxErr != nil { if ctxErr := ctx.Err(); ctxErr != nil {
return ctxErr return ctxErr
} else if err != nil { } else if err != nil {
c.logger.Warn(ctx, "waiting for instance to become ready", err) ctx := mctx.Annotate(ctx, "errMsg", err.Error())
c.logger.Info(ctx, "Instance is not online yet")
continue continue
} }
var numUp int var numUp int
for _, node := range clusterStatus.Nodes { for _, node := range clusterStatus.Nodes {
if node.IsUp { // There seems to be some kind of step between IsUp becoming true
// and garage actually loading the full state of a node, so we check
// for the HostName as well. We could also use LastSeenSecsAgo, but
// that remains null for the node being queried so it's more
// annoying to use.
if node.IsUp && node.HostName != "" {
numUp++ numUp++
} }
} }
@ -204,7 +248,7 @@ func (c *AdminClient) Wait(ctx context.Context) error {
return nil return nil
} }
c.logger.Debug(ctx, "instance not online yet, will continue waiting") c.logger.Info(ctx, "Instance is not joined to the cluster yet")
} }
} }
@ -283,20 +327,6 @@ func (c *AdminClient) GrantBucketPermissions(
}) })
} }
// Role describes a node's role in the garage cluster, i.e. what storage it is
// providing.
type Role struct {
	// ID identifies the node which the role belongs to.
	ID string `json:"id"`

	Capacity int `json:"capacity"` // Gb (SI units)

	// Zone is decoded from the "zone" field of the role.
	Zone string `json:"zone"`

	// Tags is decoded from the "tags" field of the role.
	Tags []string `json:"tags"`
}
// ClusterLayout describes the layout of the cluster as a whole.
type ClusterLayout struct {
	// Roles is decoded from the "roles" field of the layout.
	Roles []Role `json:"roles"`
}
// GetLayout returns the currently applied ClusterLayout. // GetLayout returns the currently applied ClusterLayout.
func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) { func (c *AdminClient) GetLayout(ctx context.Context) (ClusterLayout, error) {
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout // https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout
@ -323,13 +353,7 @@ func (c *AdminClient) ApplyLayout(
roles = append(roles, removeRole{ID: id, Remove: true}) roles = append(roles, removeRole{ID: id, Remove: true})
} }
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/GetLayout var clusterLayout ClusterLayout
var clusterLayout struct {
Version int `json:"version"`
StagedRoleChanges []Role `json:"stagedRoleChanges"`
}
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles) err := c.do(ctx, &clusterLayout, "POST", "/v1/layout", roles)
if err != nil { if err != nil {
return fmt.Errorf("staging layout changes: %w", err) return fmt.Errorf("staging layout changes: %w", err)
@ -337,7 +361,6 @@ func (c *AdminClient) ApplyLayout(
return nil return nil
} }
// https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html#tag/Layout/operation/ApplyLayout
applyClusterLayout := struct { applyClusterLayout := struct {
Version int `json:"version"` Version int `json:"version"`
}{ }{