Get rid of orphan removal after investigation into race conditions, new solution pending

Brian Picciano 2025-01-02 14:08:24 +01:00
parent e3d4fc5a8e
commit 9e508ef4e2
10 changed files with 358 additions and 176 deletions

View File

@@ -13,16 +13,13 @@ import (
"isle/secrets"
"isle/toolkit"
"net"
"net/netip"
"path/filepath"
"slices"
"strconv"
"time"
"dev.mediocregopher.com/mediocre-go-lib.git/mctx"
"dev.mediocregopher.com/mediocre-go-lib.git/mlog"
"github.com/minio/minio-go/v7"
"golang.org/x/exp/maps"
)
// Paths within garage's global bucket.
@@ -218,6 +215,7 @@ func getGarageBootstrapHosts(
host, err := authedHost.Unwrap(currBootstrap.CAPublicCredentials)
if err != nil {
logger.Warn(ctx, "Host could not be authenticated", err)
continue
}
hosts[host.Name] = host
@@ -363,59 +361,3 @@ func garageWaitForAlloc(
return nil
}
// garageNodeBuddyPeers returns the "buddy" peers of the given host, based on
// the given garage cluster status. It will return zero values if the host has
// no buddy.
//
// For situations where we want one host to affect the cluster layout of another
// host's peers, we use a simple system to determine a single host which is
// responsible. The goal is not to be 100% race-proof (garage handles that), but
// rather to try to prevent all hosts from modifying the same host's layout at
// the same time.
//
// The system is to order all hosts by their IP, and say that each host is
// responsible for (aka the "buddy" of) the host immediately after their own in
// that list. The last host in that list is responsible for the first.
func garageNodeBuddyPeers(
status garage.ClusterStatus, host bootstrap.Host,
) (
netip.Addr, []garage.Role,
) {
var (
thisIP = host.IP()
nodeRolesByIP = map[netip.Addr][]garage.Role{}
)
for _, node := range status.Nodes {
if node.Role == nil {
continue
}
ip := node.Addr.Addr()
nodeRolesByIP[ip] = append(nodeRolesByIP[ip], *node.Role)
}
// If there is only a single host in the cluster (or, somehow, none) then
// that host has no buddy.
if len(nodeRolesByIP) < 2 {
return netip.Addr{}, nil
}
nodeIPs := maps.Keys(nodeRolesByIP)
slices.SortFunc(nodeIPs, netip.Addr.Compare)
for i, nodeIP := range nodeIPs {
var buddyIP netip.Addr
if i == len(nodeIPs)-1 {
buddyIP = nodeIPs[0]
} else if nodeIP == thisIP {
buddyIP = nodeIPs[i+1]
} else {
continue
}
return buddyIP, nodeRolesByIP[buddyIP]
}
panic("Unreachable")
}

View File

@@ -160,6 +160,9 @@ type Opts struct {
// used, either that which it was most recently initialized with or which
// was passed to [SetConfig].
Config *daecommon.NetworkConfig
// testBlocker is used by tests to set blockpoints.
testBlocker *toolkit.TestBlocker
}
func (o *Opts) withDefaults() *Opts {
@@ -508,10 +511,6 @@ func (n *network) initialize(
n.periodically("reloadHosts", n.reloadHosts, 3*time.Minute)
n.periodically(
"removeOrphanGarageNodes", n.removeOrphanGarageNodes, 1*time.Minute,
)
return nil
}
@@ -603,77 +602,6 @@ func (n *network) reloadHosts(ctx context.Context) error {
return nil
}
// In general each host will manage the garage cluster layout of its own storage
// allocations via garageApplyLayout. There are three situations which are
// handled here, rather than garageApplyLayout:
//
// - A host removes all of its allocations via SetConfig.
// - A host removes all of its allocations by calling Load with no allocations
// in the provided daecommon.NetworkConfig.
// - A host is removed from the network by another host.
//
// In all of these cases the host no longer has any garage instances running,
// and so can't call garageApplyLayout on itself. To combat this we have all
// hosts which do have garage instances running periodically check that there
// are no garage nodes orphaned in the cluster layout, and remove them if so.
func (n *network) removeOrphanGarageNodes(ctx context.Context) error {
n.l.RLock()
defer n.l.RUnlock()
thisHost := n.currBootstrap.ThisHost()
if len(thisHost.Garage.Instances) == 0 {
n.logger.Info(ctx, "No local garage instances, cannot remove orphans")
return nil
}
adminClient := newGarageAdminClient(
n.logger, n.networkConfig, n.opts.GarageAdminToken, thisHost,
)
defer adminClient.Close()
clusterStatus, err := adminClient.Status(ctx)
if err != nil {
return fmt.Errorf("retrieving garage cluster status: %w", err)
}
buddyIP, buddyNodes := garageNodeBuddyPeers(clusterStatus, thisHost)
if len(buddyNodes) == 0 {
return nil
}
ctx = mctx.Annotate(ctx, "buddyIP", buddyIP)
for _, host := range n.currBootstrap.Hosts {
if host.IP() != buddyIP {
continue
} else if len(host.Garage.Instances) > 0 {
n.logger.Info(ctx, "Buddy instance has garage nodes configured in its bootstrap, doing nothing")
return nil
}
break
}
// Either the host is no longer in the network, or it no longer has any
// garage instances set on it. Either way, remove its nodes from the cluster
// layout.
buddyNodeIDs := make([]string, len(buddyNodes))
for i, buddyNode := range buddyNodes {
buddyNodeIDs[i] = buddyNode.ID
}
n.logger.Info(ctx, "Applying garage layout to remove orphaned garage nodes")
if err := adminClient.ApplyLayout(ctx, nil, buddyNodeIDs); err != nil {
return fmt.Errorf(
"applying garage cluster layout, removing nodes %+v: %w",
buddyNodes,
err,
)
}
return nil
}
// returns the bootstrap prior to the reload being applied.
func (n *network) reload(
ctx context.Context,

View File

@@ -1,15 +1,18 @@
package network
import (
"fmt"
"isle/bootstrap"
"isle/daemon/daecommon"
"isle/garage"
"isle/garage/garagesrv"
"isle/jsonutil"
"isle/nebula"
"isle/toolkit"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -71,7 +74,7 @@ func TestJoin(t *testing.T) {
assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
})
t.Run("with alloc", func(t *testing.T) {
t.Run("with alloc/simple", func(t *testing.T) {
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
@@ -86,6 +89,61 @@ func TestJoin(t *testing.T) {
assert.NoError(t, primus.Network.(*network).reloadHosts(h.ctx))
assert.Equal(t, primus.getHostsByName(t), secondus.getHostsByName(t))
assertGarageLayout(t, map[*integrationHarnessNetwork]int{
primus: 3,
secondus: 1,
})
})
// Assert that if primus runs the orphan remover at the same moment that
// secondus is joining, the layout applied by secondus doesn't get
// overwritten.
t.Run("with alloc/remove orphans after garage layout applied", func(t *testing.T) {
t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
primusAdminClient = primus.garageAdminClient(t)
secondusBlocker = toolkit.NewTestBlocker(t)
)
secondusBlocker.ExpectBlockpoint("garageLayoutApplied").On(
t, h.ctx, func() {
h.logger.Info(h.ctx, "Waiting for new layout to propagate to primus")
err := toolkit.UntilTrue(
h.ctx, h.logger, 1*time.Second, func() (bool, error) {
layout, err := primusAdminClient.GetLayout(h.ctx)
if err != nil {
return false, fmt.Errorf("getting layout: %w", err)
}
return len(layout.Roles) == 4, nil
},
)
if !assert.NoError(t, err) {
return
}
//h.logger.Info(h.ctx, "Calling removeOrphanGarageNodes")
//assert.NoError(
// t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
//)
},
)
secondus := h.joinNetwork(t, primus, "secondus", &joinNetworkOpts{
networkConfigOpts: &networkConfigOpts{
numStorageAllocs: 1,
},
blocker: secondusBlocker,
})
assertGarageLayout(t, map[*integrationHarnessNetwork]int{
primus: 3,
secondus: 1,
})
})
}
@@ -285,6 +343,8 @@ func TestNetwork_SetConfig(t *testing.T) {
})
t.Run("remove all storage allocs", func(t *testing.T) {
t.Skip("This is currently expected to fail. Orphan removal is going to be reworked accordingly")
var (
h = newIntegrationHarness(t)
primus = h.createNetwork(t, "primus", nil)
@@ -319,10 +379,10 @@ func TestNetwork_SetConfig(t *testing.T) {
assert.NoError(t, err)
assert.Contains(t, layout.Roles, removedRole)
t.Log("Removing orphan garage nodes with primus")
assert.NoError(
t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
)
//t.Log("Removing orphan garage nodes with primus")
//assert.NoError(
//	t, primus.Network.(*network).removeOrphanGarageNodes(h.ctx),
//)
t.Log("Checking that garage layout no longer contains the old allocation")
layout, err = primusGarageAdminClient.GetLayout(h.ctx)

View File

@@ -1,6 +1,7 @@
package network
import (
"cmp"
"context"
"fmt"
"isle/bootstrap"
@@ -11,6 +12,7 @@ import (
"isle/toolkit"
"os"
"path/filepath"
"slices"
"sync"
"sync/atomic"
"testing"
@@ -251,6 +253,7 @@ type joinNetworkOpts struct {
*networkConfigOpts
canCreateHosts bool
manualShutdown bool
blocker *toolkit.TestBlocker
}
func (o *joinNetworkOpts) withDefaults() *joinNetworkOpts {
@@ -288,6 +291,7 @@ func (h *integrationHarness) joinNetwork(
networkOpts = &Opts{
GarageAdminToken: "admin_token",
Config: &networkConfig,
testBlocker: opts.blocker,
}
)
@@ -382,3 +386,56 @@ func (nh *integrationHarnessNetwork) getHostsByName(
require.NoError(t, err)
return currBootstrap.Hosts
}
func assertGarageLayout(
t *testing.T,
wantLayout map[*integrationHarnessNetwork]int, // network -> num allocs
) {
wantLayoutSimple := map[string]int{}
for nh, wantAllocs := range wantLayout {
wantLayoutSimple[string(nh.hostName)] = wantAllocs
}
normalizeLayout := func(layout *garage.ClusterLayout) {
slices.SortFunc(layout.Roles, func(a, b garage.Role) int {
return cmp.Compare(a.ID, b.ID)
})
}
assertSingle := func(
nh *integrationHarnessNetwork, layout garage.ClusterLayout,
) {
gotLayoutSimple := map[string]int{}
for _, role := range layout.Roles {
gotLayoutSimple[role.Zone]++
}
assert.Equal(t, wantLayoutSimple, gotLayoutSimple, "layout from %q", nh.hostName)
}
var (
lastLayoutHostName nebula.HostName
lastLayout garage.ClusterLayout
)
for nh := range wantLayout {
layout, err := nh.garageAdminClient(t).GetLayout(nh.ctx)
assert.NoError(t, err)
normalizeLayout(&layout)
assertSingle(nh, layout)
if lastLayoutHostName != "" {
assert.Equal(
t,
lastLayout,
layout,
"layout of %q not equal to layout of %q",
lastLayoutHostName,
nh.hostName,
)
}
lastLayoutHostName = nh.hostName
lastLayout = layout
}
}

View File

@@ -0,0 +1,147 @@
package toolkit
import (
"context"
"sync"
"testing"
"github.com/stretchr/testify/assert"
)
// ExpectedBlockpoint represents the expectation that Blockpoint will be called
// on a TestBlocker. It is possible to both wait for the Blockpoint call to
// occur and to unblock it once it has occurred.
type ExpectedBlockpoint struct {
waitCh chan struct{}
unblockCh chan struct{}
}
// Wait will block until the blockpoint has been hit and is itself blocking, or will
// return the context error.
func (eb ExpectedBlockpoint) Wait(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-eb.waitCh:
return nil
}
}
// Unblock unblocks the Blockpoint call which is/was expected. Unblock can be
// called prior to Wait being called (and therefore prior to the Blockpoint
// being hit).
func (eb ExpectedBlockpoint) Unblock() {
close(eb.unblockCh)
}
// On is a helper which will spawn a go-routine, call Wait on the
// ExpectedBlockpoint, call the given callback, and then Unblock the
// ExpectedBlockpoint.
//
// If Wait returns an error (due to context cancellation) then this fails the
// test and returns without calling the callback.
func (eb ExpectedBlockpoint) On(t *testing.T, ctx context.Context, cb func()) {
go func() {
defer eb.Unblock()
if !assert.NoError(t, eb.Wait(ctx)) {
return
}
cb()
}()
}
// TestBlocker is used as an injected dependency into components, so that tests
// can cause those components to block at specific execution points internally.
// This is useful for testing race conditions between multiple components.
//
// A TestBlocker is initialized using `new`. A nil TestBlocker will never block.
type TestBlocker struct {
l sync.Mutex
blockpointsByID map[string][]ExpectedBlockpoint
blocksByID map[string]int
}
// NewTestBlocker initializes a TestBlocker and registers a Cleanup callback on
// the T which will call AssertExpectations.
func NewTestBlocker(t *testing.T) *TestBlocker {
b := new(TestBlocker)
t.Cleanup(func() { b.AssertExpectations(t) })
return b
}
// Blockpoint will block if and only if TestBlocker is non-nil and
// ExpectBlockpoint has been called with the same ID previously. If the context
// is canceled while blocking then this call will return.
func (b *TestBlocker) Blockpoint(ctx context.Context, id string) {
if b == nil {
return
}
b.l.Lock()
blockpoints := b.blockpointsByID[id]
if len(blockpoints) == 0 {
b.l.Unlock()
return
}
blockpoint, blockpoints := blockpoints[0], blockpoints[1:]
b.blockpointsByID[id] = blockpoints
b.blocksByID[id]++
b.l.Unlock()
close(blockpoint.waitCh)
select {
case <-ctx.Done():
case <-blockpoint.unblockCh:
}
}
// ExpectBlockpoint will cause the TestBlocker to block upon the next call to
// Blockpoint using the same id. The returned ExpectedBlockpoint can be used to
// wait until Blockpoint is called, as well as to unblock it.
func (b *TestBlocker) ExpectBlockpoint(id string) ExpectedBlockpoint {
b.l.Lock()
defer b.l.Unlock()
if b.blockpointsByID == nil {
b.blockpointsByID = map[string][]ExpectedBlockpoint{}
}
if b.blocksByID == nil {
b.blocksByID = map[string]int{}
}
blockpoint := ExpectedBlockpoint{
waitCh: make(chan struct{}),
unblockCh: make(chan struct{}),
}
b.blockpointsByID[id] = append(b.blockpointsByID[id], blockpoint)
return blockpoint
}
// AssertExpectations will Fail the test and return false if any calls to
// ExpectBlockpoint have not had a corresponding Blockpoint call made.
func (b *TestBlocker) AssertExpectations(t *testing.T) bool {
b.l.Lock()
defer b.l.Unlock()
var failed bool
for id, blockpoints := range b.blockpointsByID {
if len(blockpoints) == 0 {
continue
}
failed = true
t.Errorf(
"Blockpoint(%q) called %d times, expected %d more",
id, b.blocksByID[id], len(blockpoints),
)
}
return !failed
}
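A minimal usage sketch, not part of this commit: doWork and the "halfDone" ID below are hypothetical, though the TestBlocker methods are the ones defined above. A component takes an injected *TestBlocker and calls Blockpoint at the interesting execution point, while the test registers an expectation and reacts while the component is blocked there.

package example

import (
	"context"
	"testing"

	"isle/toolkit"
)

// doWork stands in for a component method with an internal blockpoint.
func doWork(ctx context.Context, blocker *toolkit.TestBlocker) {
	// ... first half of the work ...
	blocker.Blockpoint(ctx, "halfDone") // no-op if blocker is nil
	// ... second half of the work ...
}

func TestDoWork(t *testing.T) {
	var (
		ctx     = context.Background()
		blocker = toolkit.NewTestBlocker(t) // asserts expectations on test cleanup
	)

	// React while doWork is blocked at "halfDone", then unblock it.
	blocker.ExpectBlockpoint("halfDone").On(t, ctx, func() {
		// e.g. trigger the racing operation being tested here
	})

	doWork(ctx, blocker)
}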

View File

@@ -1,14 +0,0 @@
---
type: task
---
When removing a storage allocation the new layout should be applied _before_ the
impacted garage instance is shut down. Isle should ideally also wait for the
impacted instance to no longer be in the "draining" state prior to shutting it
down, if possible.
Some care needs to be taken in the case of the `daemon.yml` file being used for
configuration. The daemon should probably initially load using the old
configuration, and only then apply the new configuration as if it were applied
using `SetConfig`. This way the garage instance being removed can be brought
back up, drained, then shut down again.

View File

@@ -0,0 +1,84 @@
---
type: task
---
## Problem
There are high-level but extremely problematic issues with how garage layout
management is being done.
In general the strategy around layout management is that each host only modifies
the cluster layout related to itself, and never touches applied roles of other
hosts. This works great for all except one case: a host removing one or more of
its allocations.
There are two separate issues which must be dealt with, each related partially
to the other.
## Draining of garage data
When a garage node is removed from the cluster it first goes into the "draining"
state, so that other nodes in the cluster can ensure that the replication factor
for each piece of data is met prior to the node being decommissioned.
While the node is in draining state it cannot be used for S3 API calls, as the
bucket credentials are no longer present on it.
## Configuration change on restart
For hosts whose configuration is managed by `daemon.yml` it is not necessarily
known that a garage node used to exist at all upon restart. The host can't
investigate the cluster layout because it won't have a garage instance running,
and even if it could it wouldn't be able to bring up a garage node to properly
drain the old allocations.
## Invalid Solutions
One solution which is tempting but ultimately NOT viable is to make all hosts
run at least one garage instance, and if they have no storage allocations to
make that instance be a "gateway" instance. This won't work though, because
it would require all hosts to open up the RPC port on their firewall, and
firewall management requires extra user involvement.
Another previous solution was to use an "orphan remover" process on each host,
where the host would compare the garage cluster layout to the expected layout
based on the bootstrap data in the common bucket, and remove any hosts from the
layout which shouldn't be there and don't have a garage instance to remove
themselves with. This had a bunch of unresolvable race conditions, and it
didn't account for draining besides.
## Possible Solution
The solution seems to be that the host must maintain two views of its garage
allocations: the last known allocation state, and the desired allocation state.
The last known state needs to contain both what state the allocation was in
(healthy or draining) and its directories and capacity. This should get
updated anytime the host performs an action which changes it (modifying the
cluster layout to add a new instance or move an existing one to draining, or
actually removing an instance which is done draining).
The desired state is essentially the network configuration as it is now. This
will be used along with the last known state to take actions.
There are a few details to note with this solution:
- There will need to be a worker which periodically checks the last known state
for any nodes which were draining, and if they are done draining then remove
them.
- When the host starts up it should _always_ use the last known state, and only
once started up should it go to apply the desired configuration.
- When choosing an admin endpoint to use, the last known state should be used,
even though it might result in unexpected behavior from the user's perspective
(since the user only knows about the desired state). This applies to RPC
endpoints as well.
- The last/desired states need to be checked for conflicts, and an error emitted
in the event that there is one (either returned from SetConfig or Load). This
includes a new allocation using the same directory as an old one (based on RPC
port), or two allocations using the same RPC port.
- The nebula firewall must base its opened ports on the last known state rather
than desired state.
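As a rough illustration of the two views described above, a sketch of the state this might require (names and package placement are illustrative only and do not exist in the codebase; the desired state is essentially the existing daecommon.NetworkConfig):

package daecommon

// knownAllocState is the state an allocation was last known to be in within
// the garage cluster layout.
type knownAllocState string

const (
	knownAllocHealthy  knownAllocState = "healthy"
	knownAllocDraining knownAllocState = "draining"
)

// lastKnownAlloc records what this host believes it has already applied to the
// cluster layout for one of its allocations. It is updated every time the host
// changes the layout for that allocation: adding it, moving it to draining, or
// removing it once draining completes.
type lastKnownAlloc struct {
	RPCPort  int
	State    knownAllocState
	DataPath string
	MetaPath string
	Capacity int // same units as the capacity in the desired configuration
}

The draining worker, startup logic, SetConfig/Load conflict checks, firewall setup, and admin/RPC endpoint selection would all work by diffing a []lastKnownAlloc like this against the desired NetworkConfig, per the bullet points above.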

View File

@@ -1,8 +0,0 @@
---
type: task
---
There is a race condition in the garage orphan removal worker. Basically when
host A is adding its first allocation, it's possible that it hasn't yet updated
the host info in the common bucket, and so host B removes it from the layout
prior to that host info being updated.

View File

@@ -1,14 +0,0 @@
---
type: task
after:
- ./garage-apply-layout-before-stopping-instance.md
---
I think there is currently a bug related to re-adding a storage allocation on an
RPC port of a garage instance which was previously used:
- Step 1) Remove a storage allocation using RPC port N
- Step 2) Add a new allocation using RPC port N, but with a new data/meta dir
I believe in this case garage will go back to using the old data/meta dir, and
possibly even re-use the old pubkey.

View File

@@ -1,7 +1,7 @@
---
type: task
after:
- ./garage-apply-layout-before-stopping-instance.md
- ./garage-layout-management.md
---
When SetConfig is called, but ends up erroring, the new config should not end up