layout: refactoring and fix in layout helper

layout: refactor/fix bad while loop
fix some clippy lints
2023-12-11 16:09:22 +01:00 · 2023-12-11 15:45:14 +01:00 · 2023-12-11 15:31:47 +01:00 · 2023-12-11 14:57:42 +01:00 · 2023-12-08 14:54:11 +01:00 · 2023-12-08 14:15:52 +01:00
67 changed files with 10125 additions and 3667 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@ -0,0 +1,3 @@
+[target.x86_64-unknown-linux-gnu]
+linker = "clang"
+rustflags = ["-C", "link-arg=-fuse-ld=mold"]
--- a/.gitignore
+++ b/.gitignore
@ -4,5 +4,3 @@
 **/*.rs.bk
 *.swp
 /.direnv
-/.cargo
-/result
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1271,6 +1271,7 @@ dependencies = [
 "http-range",
 "httpdate",
 "hyper",
+ "hyperlocal",
 "idna",
 "md-5",
 "multer",
@ -1464,6 +1465,7 @@ dependencies = [
 "garage_util",
 "http",
 "hyper",
+ "hyperlocal",
 "opentelemetry",
 "percent-encoding",
 "tokio",
@ -1776,6 +1778,19 @@ dependencies = [
 "tokio-io-timeout",
 ]

+[[package]]
+name = "hyperlocal"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fafdf7b2b2de7c9784f76e02c0935e65a8117ec3b768644379983ab333ac98c"
+dependencies = [
+ "futures-util",
+ "hex",
+ "hyper",
+ "pin-project",
+ "tokio",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.57"
--- a/Cargo.nix
+++ b/Cargo.nix
--- a/default.nix
+++ b/default.nix
@ -1,166 +1,56 @@
-{
-  buildSystem ? builtins.currentSystem,
-  targetSystem ? buildSystem,
-  gitVersion ? null,
-  release ? false,
-  features ? null,
-}:
+{ system ? builtins.currentSystem, git_version ? null, }:
+
+with import ./nix/common.nix;

 let
-  pkgsSrc = import ./nix/pkgs.nix;
-  newBuildTarget = {
-    nixPkgsSystem,
-    rustTarget ? nixPkgsSystem,
-    nativeBuildInputs ? pkgsCross: [],
-    rustFlags ? pkgsCross: [],
-  }: {
-    inherit nixPkgsSystem rustTarget nativeBuildInputs rustFlags;
+  pkgs = import pkgsSrc { };
+  compile = import ./nix/compile.nix;
+
+  build_debug_and_release = (target: {
+    debug = (compile {
+      inherit system target git_version pkgsSrc cargo2nixOverlay;
+      release = false;
+    }).workspace.garage { compileMode = "build"; };
+
+    release = (compile {
+      inherit system target git_version pkgsSrc cargo2nixOverlay;
+      release = true;
+    }).workspace.garage { compileMode = "build"; };
+  });
+
+  test = (rustPkgs:
+    pkgs.symlinkJoin {
+      name = "garage-tests";
+      paths =
+        builtins.map (key: rustPkgs.workspace.${key} { compileMode = "test"; })
+        (builtins.attrNames rustPkgs.workspace);
+    });
+
+in {
+  pkgs = {
+    amd64 = build_debug_and_release "x86_64-unknown-linux-musl";
+    i386 = build_debug_and_release "i686-unknown-linux-musl";
+    arm64 = build_debug_and_release "aarch64-unknown-linux-musl";
+    arm = build_debug_and_release "armv6l-unknown-linux-musleabihf";
  };
-
-  # centralize per-target configuration in a single place.
-  buildTargets = {
-    "x86_64-linux" = newBuildTarget {
-      nixPkgsSystem = "x86_64-unknown-linux-musl";
-    };
-
-    "i686-linux" = newBuildTarget {
-      nixPkgsSystem = "i686-unknown-linux-musl";
-    };
-
-    "aarch64-linux" = newBuildTarget {
-      nixPkgsSystem = "aarch64-unknown-linux-musl";
-    };
-
-    # Old Raspberry Pi's (not currently supported due to linking errors with
-    # libsqlite3 and libsodium
-    #"armv6l-linux" = newBuildTarget {
-    #  nixPkgsSystem = "armv6l-unknown-linux-musleabihf";
-    #  rustTarget = "arm-unknown-linux-musleabihf";
-    #};
-
-    "x86_64-windows" = newBuildTarget {
-      nixPkgsSystem = "x86_64-w64-mingw32";
-      rustTarget = "x86_64-pc-windows-gnu";
-      nativeBuildInputs = pkgsCross: [ pkgsCross.windows.pthreads ];
-      rustFlags = pkgsCross: [
-        "-C" "link-arg=-L${pkgsCross.windows.pthreads}/lib"
-      ];
-    };
-  };
-
-  buildTarget = buildTargets.${targetSystem};
-
-  pkgs = import pkgsSrc { system = buildSystem; };
-  pkgsCross = import pkgsSrc {
-    system = buildSystem;
-    crossSystem.config = buildTarget.nixPkgsSystem;
-  };
-
-  rustTarget = buildTarget.rustTarget;
-
-  toolchain = let
-    fenix = import (pkgs.fetchFromGitHub {
-      owner = "nix-community";
-      repo = "fenix";
-      rev = "81ab0b4f7ae9ebb57daa0edf119c4891806e4d3a";
-      hash = "sha256-bZmI7ytPAYLpyFNgj5xirDkKuAniOkj1xHdv5aIJ5GM=";
-    }) {
-      system = buildSystem;
-    };
-
-    mkToolchain = fenixTarget: fenixTarget.toolchainOf {
-      channel = "1.68.2";
-      sha256 = "sha256-4vetmUhTUsew5FODnjlnQYInzyLNyDwocGa4IvMk3DM=";
-    };
-  in
-    fenix.combine [
-      (mkToolchain fenix).rustc
-      (mkToolchain fenix).rustfmt
-      (mkToolchain fenix).cargo
-      (mkToolchain fenix).clippy
-      (mkToolchain fenix.targets.${rustTarget}).rust-std
-    ];
-
-  naersk = let
-    naerskSrc = pkgs.fetchFromGitHub {
-      owner = "nix-community";
-      repo = "naersk";
-      rev = "d9a33d69a9c421d64c8d925428864e93be895dcc";
-      hash = "sha256-e136hTT7LqQ2QjOTZQMW+jnsevWwBpMj78u6FRUsH9I=";
-    };
-  in
-    pkgs.callPackages naerskSrc {
-      cargo = toolchain;
-      rustc = toolchain;
-    };
-
-  builtFeatures = if features != null then
-      features
-  else (
-    [ "garage/bundled-libs" "garage/sled" "garage/lmdb" "garage/k2v" ] ++ (
-      if release then [
-        "garage/consul-discovery"
-        "garage/kubernetes-discovery"
-        "garage/metrics"
-        "garage/telemetry-otlp"
+  test = {
+    amd64 = test (compile {
+      inherit system git_version pkgsSrc cargo2nixOverlay;
+      target = "x86_64-unknown-linux-musl";
+      features = [
+        "garage/bundled-libs"
+        "garage/k2v"
+        "garage/sled"
+        "garage/lmdb"
        "garage/sqlite"
-      ] else [ ]
-    )
-  );
-
-  # For some reason the pkgsCross.pkgsStatic build of libsodium doesn't contain
-  # a `.a` file when compiled to a windows target, but rather contains
-  # a `.dll.a` file which libsodium-sys doesn't pick up on. Copying the one to
-  # the be the other seems to work.
-  libsodium = pkgs.runCommand "libsodium-wrapped" {
-    libsodium = pkgsCross.pkgsStatic.libsodium;
-  } ''
-    cp -rL "$libsodium" "$out"
-    chmod -R +w "$out"
-    if [ ! -e "$out"/lib/libsodium.a ] && [ -f "$out"/lib/libsodium.dll.a ]; then
-      cp "$out"/lib/libsodium.dll.a "$out"/lib/libsodium.a
-    fi
-  '';
-
-in rec {
-  inherit pkgs pkgsCross;
-
-  # Exported separately so it can be used from shell.nix
-  buildEnv = rec {
-    nativeBuildInputs = (buildTarget.nativeBuildInputs pkgsCross) ++ [
-      toolchain
-      pkgs.protobuf
-
-      # Required for shell because of rust dependency build scripts which must
-      # run on the build system.
-      pkgs.stdenv.cc
      ];
-
-    SODIUM_LIB_DIR = "${libsodium}/lib";
-
-    # Required because ring crate is special. This also seems to have
-    # fixed some issues with the x86_64-windows cross-compile :shrug:
-    TARGET_CC = "${pkgsCross.stdenv.cc}/bin/${pkgsCross.stdenv.cc.targetPrefix}cc";
-
-    CARGO_BUILD_TARGET = rustTarget;
-    CARGO_BUILD_RUSTFLAGS = [
-      "-C" "target-feature=+crt-static"
-      "-C" "link-arg=-static"
-
-      # https://github.com/rust-lang/cargo/issues/4133
-      "-C" "linker=${TARGET_CC}"
-    ] ++ (buildTarget.rustFlags pkgsCross);
+    });
+  };
+  clippy = {
+    amd64 = (compile {
+      inherit system git_version pkgsSrc cargo2nixOverlay;
+      target = "x86_64-unknown-linux-musl";
+      compiler = "clippy";
+    }).workspace.garage { compileMode = "build"; };
  };
-
-  build = naersk.buildPackage (rec {
-    inherit release;
-
-    src = ./.;
-    strictDeps = true;
-    doCheck = false;
-
-    cargoBuildOptions = prev: prev++[
-      "--features=${builtins.concatStringsSep "," builtFeatures}"
-    ];
-  } // buildEnv);
 }
--- a/doc/api/garage-admin-v1.html
+++ b/doc/api/garage-admin-v1.html
@ -1,24 +0,0 @@
-<!DOCTYPE html>
-<html>
-  <head>
-    <title>Garage Adminstration API v0</title>
-    <!-- needed for adaptive design -->
-    <meta charset="utf-8"/>
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <link href="./css/redoc.css" rel="stylesheet">
-
-    <!--
-    Redoc doesn't change outer page styles
-    -->
-    <style>
-      body {
-        margin: 0;
-        padding: 0;
-      }
-    </style>
-  </head>
-  <body>
-    <redoc spec-url='./garage-admin-v1.yml'></redoc>
-    <script src="./redoc.standalone.js"> </script>
-  </body>
-</html>
--- a/doc/api/garage-admin-v1.yml
+++ b/doc/api/garage-admin-v1.yml
--- a/doc/book/build/golang.md
+++ b/doc/book/build/golang.md
@ -37,84 +37,30 @@ import (
    "context"
    "fmt"
    "os"
-    "strings"
    garage "git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-golang"
 )

 func main() {
-    // Initialization
+    // Set Host and other parameters
    configuration := garage.NewConfiguration()
    configuration.Host = "127.0.0.1:3903"
+
+
+    // We can now generate a client
    client := garage.NewAPIClient(configuration)
+
+    // Authentication is handled through the context pattern
    ctx := context.WithValue(context.Background(), garage.ContextAccessToken, "s3cr3t")

-    // Nodes
-    fmt.Println("--- nodes ---")
-    nodes, _, _ := client.NodesApi.GetNodes(ctx).Execute()
-    fmt.Fprintf(os.Stdout, "First hostname: %v\n", nodes.KnownNodes[0].Hostname)
-    capa := int64(1000000000)
-    change := []garage.NodeRoleChange{
-	    garage.NodeRoleChange{NodeRoleUpdate: &garage.NodeRoleUpdate {
-	        Id: *nodes.KnownNodes[0].Id,
-	        Zone: "dc1",
-	        Capacity: *garage.NewNullableInt64(&capa),
-	        Tags: []string{ "fast", "amd64" },
-	    }},
+    // Send a request
+    resp, r, err := client.NodesApi.GetNodes(ctx).Execute()
+    if err != nil {
+        fmt.Fprintf(os.Stderr, "Error when calling `NodesApi.GetNodes`: %v\n", err)
+        fmt.Fprintf(os.Stderr, "Full HTTP response: %v\n", r)
    }
-    staged, _, _ := client.LayoutApi.AddLayout(ctx).NodeRoleChange(change).Execute()
-    msg, _, _ := client.LayoutApi.ApplyLayout(ctx).LayoutVersion(*garage.NewLayoutVersion(staged.Version + 1)).Execute()
-    fmt.Printf(strings.Join(msg.Message, "\n")) // Layout configured

-    health, _, _ := client.NodesApi.GetHealth(ctx).Execute()
-    fmt.Printf("Status: %s, nodes: %v/%v, storage: %v/%v, partitions: %v/%v\n", health.Status, health.ConnectedNodes, health.KnownNodes, health.StorageNodesOk, health.StorageNodes, health.PartitionsAllOk, health.Partitions)
-
-    // Key
-    fmt.Println("\n--- key ---")
-    key := "openapi-key"
-    keyInfo, _, _ := client.KeyApi.AddKey(ctx).AddKeyRequest(garage.AddKeyRequest{Name: *garage.NewNullableString(&key) }).Execute()
-    defer client.KeyApi.DeleteKey(ctx).Id(*keyInfo.AccessKeyId).Execute()
-    fmt.Printf("AWS_ACCESS_KEY_ID=%s\nAWS_SECRET_ACCESS_KEY=%s\n", *keyInfo.AccessKeyId, *keyInfo.SecretAccessKey.Get())
-
-    id := *keyInfo.AccessKeyId
-    canCreateBucket := true
-    updateKeyRequest := *garage.NewUpdateKeyRequest()
-    updateKeyRequest.SetName("openapi-key-updated")
-    updateKeyRequest.SetAllow(garage.UpdateKeyRequestAllow { CreateBucket: &canCreateBucket })
-    update, _, _ := client.KeyApi.UpdateKey(ctx).Id(id).UpdateKeyRequest(updateKeyRequest).Execute()
-    fmt.Printf("Updated %v with key name %v\n", *update.AccessKeyId, *update.Name)
-
-    keyList, _, _ := client.KeyApi.ListKeys(ctx).Execute()
-    fmt.Printf("Keys count: %v\n", len(keyList))
-
-    // Bucket
-    fmt.Println("\n--- bucket ---")
-    global_name := "global-ns-openapi-bucket"
-    local_name := "local-ns-openapi-bucket"
-    bucketInfo, _, _ := client.BucketApi.CreateBucket(ctx).CreateBucketRequest(garage.CreateBucketRequest{
-        GlobalAlias: &global_name,
-        LocalAlias: &garage.CreateBucketRequestLocalAlias {
-            AccessKeyId: keyInfo.AccessKeyId,
-	    Alias: &local_name,
-        },
-    }).Execute()
-    defer client.BucketApi.DeleteBucket(ctx).Id(*bucketInfo.Id).Execute()
-    fmt.Printf("Bucket id: %s\n", *bucketInfo.Id)
-
-    updateBucketRequest := *garage.NewUpdateBucketRequest()
-    website := garage.NewUpdateBucketRequestWebsiteAccess()
-    website.SetEnabled(true)
-    website.SetIndexDocument("index.html")
-    website.SetErrorDocument("errors/4xx.html")
-    updateBucketRequest.SetWebsiteAccess(*website)
-    quotas := garage.NewUpdateBucketRequestQuotas()
-    quotas.SetMaxSize(1000000000)
-    quotas.SetMaxObjects(999999999)
-    updateBucketRequest.SetQuotas(*quotas)
-    updatedBucket, _, _ := client.BucketApi.UpdateBucket(ctx).Id(*bucketInfo.Id).UpdateBucketRequest(updateBucketRequest).Execute()
-    fmt.Printf("Bucket %v website activation: %v\n", *updatedBucket.Id, *updatedBucket.WebsiteAccess)
-
-    bucketList, _, _ := client.BucketApi.ListBuckets(ctx).Execute()
-    fmt.Printf("Bucket count: %v\n", len(bucketList))
+    // Process the response
+    fmt.Fprintf(os.Stdout, "Target hostname: %v\n", resp.KnownNodes[resp.Node].Hostname)
 }
 ```

--- a/doc/book/build/javascript.md
+++ b/doc/book/build/javascript.md
@ -31,9 +31,9 @@ npm install --save git+https://git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-js.
 A short example:

 ```javascript
-const garage = require('garage_administration_api_v1garage_v0_9_0');
+const garage = require('garage_administration_api_v0garage_v0_8_0');

-const api = new garage.ApiClient("http://127.0.0.1:3903/v1");
+const api = new garage.ApiClient("http://127.0.0.1:3903/v0");
 api.authentications['bearerAuth'].accessToken = "s3cr3t";

 const [node, layout, key, bucket] = [
--- a/doc/book/build/python.md
+++ b/doc/book/build/python.md
@ -80,7 +80,7 @@ from garage_admin_sdk.apis import *
 from garage_admin_sdk.models import *

 configuration = garage_admin_sdk.Configuration(
-  host = "http://localhost:3903/v1",
+  host = "http://localhost:3903/v0",
  access_token = "s3cr3t"
 )

@ -94,14 +94,13 @@ print(f"running garage {status.garage_version}, node_id {status.node}")

 # Change layout of this node
 current = layout.get_layout()
-layout.add_layout([
-  NodeRoleChange(
-    id = status.node,
+layout.add_layout({
+  status.node: NodeClusterInfo(
    zone = "dc1",
-    capacity = 1000000000,
+    capacity = 1,
    tags = [ "dev" ],
  )
-])
+})
 layout.apply_layout(LayoutVersion(
  version = current.version + 1
 ))
--- a/doc/book/cookbook/exposing-websites.md
+++ b/doc/book/cookbook/exposing-websites.md
@ -38,7 +38,7 @@ Our website serving logic is as follow:

 Now we need to infer the URL of your website through your bucket name.
 Let assume:
-  - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#web_root_domain))
+  - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#root_domain))
  - our bucket name is `garagehq.deuxfleurs.fr`.

 Our bucket will be served if the Host field matches one of these 2 values (the port is ignored):
--- a/doc/book/operations/layout.md
+++ b/doc/book/operations/layout.md
@ -12,7 +12,7 @@ An introduction to building cluster layouts can be found in the [production depl
 In Garage, all of the data that can be stored in a given cluster is divided
 into slices which we call *partitions*. Each partition is stored by
 one or several nodes in the cluster
-(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)).
+(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)).
 The layout determines the correspondence between these partition,
 which exist on a logical level, and actual storage nodes.

--- a/doc/book/reference-manual/admin-api.md
+++ b/doc/book/reference-manual/admin-api.md
@ -13,11 +13,8 @@ We will bump the version numbers prefixed to each API endpoint at each time the
 or semantics change, meaning that code that relies on these endpoint will break
 when changes are introduced.

-Versions:
- - Before Garage 0.7.2 - no admin API
- - Garage 0.7.2 - admin APIv0
- - Garage 0.9.0 - admin APIv1, deprecate admin APIv0
-
+The Garage administration API was introduced in version 0.7.2; this document
+does not apply to older versions of Garage.


 ## Access control
@ -134,9 +131,7 @@ $ curl -so /dev/null -w "%{http_code}" http://localhost:3903/check?domain=exampl

 ### Cluster operations

-These endpoints have a dedicated OpenAPI spec.
- - APIv1 - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.yml)
- - APIv0 (deprecated) - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml)
+These endpoints are defined on a dedicated [Redocly page](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html). You can also download its [OpenAPI specification](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml).

 Requesting the API from the command line can be as simple as running:

--- a/doc/book/reference-manual/configuration.md
+++ b/doc/book/reference-manual/configuration.md
@ -8,8 +8,6 @@ weight = 20
 Here is an example `garage.toml` configuration file that illustrates all of the possible options:

 ```toml
-replication_mode = "3"
-
 metadata_dir = "/var/lib/garage/meta"
 data_dir = "/var/lib/garage/data"
 metadata_fsync = true
@ -23,6 +21,8 @@ sled_cache_capacity = "128MiB"
 sled_flush_every_ms = 2000
 lmdb_map_size = "1T"

+replication_mode = "3"
+
 compression_level = 1

 rpc_secret = "4425f5c26c5e11581d3223904324dcb5b5d5dfb14e5e7f35e38c595424f5f1e6"
@ -77,64 +77,157 @@ The following gives details about each available configuration option.

 ## Available configuration options

-### Index
+### `metadata_dir`

-Top-level configuration options:
-[`block_size`](#block_size),
-[`bootstrap_peers`](#bootstrap_peers),
-[`compression_level`](#compression_level),
-[`data_dir`](#metadata_dir),
-[`data_fsync`](#data_fsync),
-[`db_engine`](#db_engine),
-[`lmdb_map_size`](#lmdb_map_size),
-[`metadata_dir`](#metadata_dir),
-[`metadata_fsync`](#metadata_fsync),
-[`replication_mode`](#replication_mode),
-[`rpc_bind_addr`](#rpc_bind_addr),
-[`rpc_public_addr`](#rpc_public_addr),
-[`rpc_secret`](#rpc_secret),
-[`rpc_secret_file`](#rpc_secret),
-[`sled_cache_capacity`](#sled_cache_capacity),
-[`sled_flush_every_ms`](#sled_flush_every_ms).
+The directory in which Garage will store its metadata. This contains the node identifier,
+the network configuration and the peer list, the list of buckets and keys as well
+as the index of all objects, object version and object blocks.

-The `[consul_discovery]` section:
-[`api`](#consul_api),
-[`ca_cert`](#consul_ca_cert),
-[`client_cert`](#consul_client_cert),
-[`client_key`](#consul_client_cert),
-[`consul_http_addr`](#consul_http_addr),
-[`meta`](#consul_tags),
-[`service_name`](#consul_service_name),
-[`tags`](#consul_tags),
-[`tls_skip_verify`](#consul_tls_skip_verify),
-[`token`](#consul_token).
+Store this folder on a fast SSD drive if possible to maximize Garage's performance.

-The `[kubernetes_discovery]` section:
-[`namespace`](#kube_namespace),
-[`service_name`](#kube_service_name),
-[`skip_crd`](#kube_skip_crd).
+### `data_dir`

-The `[s3_api]` section:
-[`api_bind_addr`](#s3_api_bind_addr),
-[`root_domain`](#s3_root_domain),
-[`s3_region`](#s3_region).
+The directory in which Garage will store the data blocks of objects.
+This folder can be placed on an HDD. The space available for `data_dir`
+should be counted to determine a node's capacity
+when [adding it to the cluster layout](@/documentation/cookbook/real-world.md).

-The `[s3_web]` section:
-[`bind_addr`](#web_bind_addr),
-[`root_domain`](#web_root_domain).
+Since `v0.9.0`, Garage supports multiple data directories with the following syntax:

-The `[admin]` section:
-[`api_bind_addr`](#admin_api_bind_addr),
-[`metrics_token`](#admin_metrics_token),
-[`metrics_token_file`](#admin_metrics_token),
-[`admin_token`](#admin_token),
-[`admin_token_file`](#admin_token),
-[`trace_sink`](#admin_trace_sink),
+```toml
+data_dir = [
+    { path = "/path/to/old_data", read_only = true },
+    { path = "/path/to/new_hdd1", capacity = "2T" },
+    { path = "/path/to/new_hdd2", capacity = "4T" },
+]
+```

+See [the dedicated documentation page](@/documentation/operations/multi-hdd.md)
+on how to operate Garage in such a setup.

-### Top-level configuration options
+### `db_engine` (since `v0.8.0`)

-#### `replication_mode` {#replication_mode}
+By default, Garage uses the Sled embedded database library
+to store its metadata on-disk. Since `v0.8.0`, Garage can use alternative storage backends as follows:
+
+| DB engine | `db_engine` value | Database path |
+| --------- | ----------------- | ------------- |
+| [Sled](https://sled.rs) | `"sled"` | `<metadata_dir>/db/` |
+| [LMDB](https://www.lmdb.tech) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
+| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
+
+Performance characteristics of the different DB engines are as follows:
+
+- Sled: the default database engine, which tends to produce
+  large data files and also has performance issues, especially when the metadata folder
+  is on a traditional HDD and not on SSD.
+- LMDB: the recommended alternative on 64-bit systems,
+  much more space-efficient and slightly faster. Note that the data format of LMDB is not portable
+  between architectures, so for instance the Garage database of an x86-64
+  node cannot be moved to an ARM64 node. Also note that, while LMDB can technically be used on 32-bit systems,
+  this will limit your node to very small database sizes due to how LMDB works; it is therefore not recommended.
+- Sqlite: Garage supports Sqlite as a storage backend for metadata,
+  however it may have issues and is also very slow in its current implementation,
+  so it is not recommended to be used for now.
+
+It is possible to convert Garage's metadata directory from one format to another with a small utility named `convert_db`,
+which can be downloaded at the following locations:
+[for amd64](https://garagehq.deuxfleurs.fr/_releases/convert_db/amd64/convert_db),
+[for i386](https://garagehq.deuxfleurs.fr/_releases/convert_db/i386/convert_db),
+[for arm64](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm64/convert_db),
+[for arm](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm/convert_db).
+The `convert_db` utility is used as follows:
+
+```
+convert-db -a <input db engine> -i <input db path> \
+  		   -b <output db engine> -o <output db path>
+```
+
+Make sure to specify the full database path as presented in the table above,
+and not just the path to the metadata directory.
+
+### `metadata_fsync`
+
+Whether to enable synchronous mode for the database engine or not.
+This is disabled (`false`) by default.
+
+This reduces the risk of metadata corruption in case of power failures,
+at the cost of a significant drop in write performance,
+as Garage will have to pause to sync data to disk much more often
+(several times for API calls such as PutObject).
+
+Using this option reduces the risk of simultaneous metadata corruption on several
+cluster nodes, which could lead to data loss.
+
+If multi-site replication is used, this option is most likely not necessary, as
+it is extremely unlikely that two nodes in different locations will have a 
+power failure at the exact same time.
+
+(Metadata corruption on a single node is not an issue, the corrupted data file
+can always be deleted and reconstructed from the other nodes in the cluster.)
+
+Here is how this option impacts the different database engines:
+
+| Database | `metadata_fsync = false` (default) | `metadata_fsync = true`       |
+|----------|------------------------------------|-------------------------------|
+| Sled     | default options                    | *unsupported*                 |
+| Sqlite   | `PRAGMA synchronous = OFF`         | `PRAGMA synchronous = NORMAL` |
+| LMDB     | `MDB_NOMETASYNC` + `MDB_NOSYNC`    | `MDB_NOMETASYNC`              |
+
+Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`).
+
+### `data_fsync`
+
+Whether to `fsync` data blocks and their containing directory after they are
+saved to disk.
+This is disabled (`false`) by default.
+
+This might reduce the risk that a data block is lost in rare
+situations such as several nodes losing power simultaneously,
+at the cost of a moderate drop in write performance.
+
+Similarly to `metadata_fsync`, this is likely not necessary
+if geographical replication is used.
+
+### `block_size`
+
+Garage splits stored objects in consecutive chunks of size `block_size`
+(except the last one which might be smaller). The default size is 1MiB and
+should work in most cases. We recommend increasing it to e.g. 10MiB if
+you are using Garage to store large files and have fast network connections
+between all nodes (e.g. 1gbps).
+
+If you are interested in tuning this, feel free to do so (and remember to
+report your findings to us!). When this value is changed for a running Garage
+installation, only files newly uploaded will be affected. Previously uploaded
+files will remain available. This however means that chunks from existing files
+will not be deduplicated with chunks from newly uploaded files, meaning you
+might use more storage space than is optimally possible.
+
+### `sled_cache_capacity`
+
+This parameter can be used to tune the capacity of the cache used by
+[sled](https://sled.rs), the database Garage uses internally to store metadata.
+Tune this to fit the RAM you wish to make available to your Garage instance.
+This value has a conservative default (128MB) so that Garage doesn't use too much
+RAM by default, but feel free to increase this for higher performance.
+
+### `sled_flush_every_ms`
+
+This parameter can be used to tune the flushing interval of sled.
+Increase this if sled is thrashing your SSD, at the risk of losing more data in case
+of a power outage (though this should not matter much as data is replicated on other
+nodes). The default value, 2000ms, should be appropriate for most use cases.
+
+### `lmdb_map_size`
+
+This parameter can be used to set the map size used by LMDB,
+which is the size of the virtual memory region used for mapping the database file.
+The value of this parameter is the maximum size the metadata database can take.
+This value is not bound by the physical RAM size of the machine running Garage.
+If not specified, it defaults to 1GiB on 32-bit machines and 1TiB on 64-bit machines.
+
+### `replication_mode`

 Garage supports the following replication modes:

@ -217,160 +310,7 @@ to the cluster while rebalancing is in progress.  In theory, no data should be
 lost as rebalancing is a routine operation for Garage, although we cannot
 guarantee you that everything will go right in such an extreme scenario.

-#### `metadata_dir` {#metadata_dir}
-
-The directory in which Garage will store its metadata. This contains the node identifier,
-the network configuration and the peer list, the list of buckets and keys as well
-as the index of all objects, object version and object blocks.
-
-Store this folder on a fast SSD drive if possible to maximize Garage's performance.
-
-#### `data_dir` {#data_dir}
-
-The directory in which Garage will store the data blocks of objects.
-This folder can be placed on an HDD. The space available for `data_dir`
-should be counted to determine a node's capacity
-when [adding it to the cluster layout](@/documentation/cookbook/real-world.md).
-
-Since `v0.9.0`, Garage supports multiple data directories with the following syntax:
-
-```toml
-data_dir = [
-    { path = "/path/to/old_data", read_only = true },
-    { path = "/path/to/new_hdd1", capacity = "2T" },
-    { path = "/path/to/new_hdd2", capacity = "4T" },
-]
-```
-
-See [the dedicated documentation page](@/documentation/operations/multi-hdd.md)
-on how to operate Garage in such a setup.
-
-#### `db_engine` (since `v0.8.0`) {#db_engine}
-
-Since `v0.8.0`, Garage can use alternative storage backends as follows:
-
-| DB engine | `db_engine` value | Database path |
-| --------- | ----------------- | ------------- |
-| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
-| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `<metadata_dir>/db/` |
-| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
-
-Sled was the only database engine up to Garage v0.7.0. Performance issues and
-API limitations of Sled prompted the addition of alternative engines in v0.8.0.
-Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is
-deprecated. We plan to remove Sled in Garage v1.0.
-
-Performance characteristics of the different DB engines are as follows:
-
- Sled: tends to produce large data files and also has performance issues,
-  especially when the metadata folder is on a traditional HDD and not on SSD.
-
- LMDB: the recommended database engine on 64-bit systems, much more
-  space-efficient and slightly faster. Note that the data format of LMDB is not
-  portable between architectures, so for instance the Garage database of an
-  x86-64 node cannot be moved to an ARM64 node. Also note that, while LMDB can
-  technically be used on 32-bit systems, this will limit your node to very
-  small database sizes due to how LMDB works; it is therefore not recommended.
-
- Sqlite: Garage supports Sqlite as an alternative storage backend for
-  metadata, and although it has not been tested as much, it is expected to work
-  satisfactorily.  Since Garage v0.9.0, performance issues have largely been
-  fixed by allowing for a no-fsync mode (see `metadata_fsync`). Sqlite does not
-  have the database size limitation of LMDB on 32-bit systems.
-
-It is possible to convert Garage's metadata directory from one format to another
-using the `garage convert-db` command, which should be used as follows:
-
-```
-garage convert-db -a <input db engine> -i <input db path> \
-                  -b <output db engine> -o <output db path>
-```
-
-Make sure to specify the full database path as presented in the table above
-(third colummn), and not just the path to the metadata directory.
-
-#### `metadata_fsync` {#metadata_fsync}
-
-Whether to enable synchronous mode for the database engine or not.
-This is disabled (`false`) by default.
-
-This reduces the risk of metadata corruption in case of power failures,
-at the cost of a significant drop in write performance,
-as Garage will have to pause to sync data to disk much more often
-(several times for API calls such as PutObject).
-
-Using this option reduces the risk of simultaneous metadata corruption on several
-cluster nodes, which could lead to data loss.
-
-If multi-site replication is used, this option is most likely not necessary, as
-it is extremely unlikely that two nodes in different locations will have a 
-power failure at the exact same time.
-
-(Metadata corruption on a single node is not an issue, the corrupted data file
-can always be deleted and reconstructed from the other nodes in the cluster.)
-
-Here is how this option impacts the different database engines:
-
-| Database | `metadata_fsync = false` (default) | `metadata_fsync = true`       |
-|----------|------------------------------------|-------------------------------|
-| Sled     | default options                    | *unsupported*                 |
-| Sqlite   | `PRAGMA synchronous = OFF`         | `PRAGMA synchronous = NORMAL` |
-| LMDB     | `MDB_NOMETASYNC` + `MDB_NOSYNC`    | `MDB_NOMETASYNC`              |
-
-Note that the Sqlite database is always ran in `WAL` mode (`PRAGMA journal_mode = WAL`).
-
-#### `data_fsync` {#data_fsync}
-
-Whether to `fsync` data blocks and their containing directory after they are
-saved to disk.
-This is disabled (`false`) by default.
-
-This might reduce the risk that a data block is lost in rare
-situations such as simultaneous node losing power,
-at the cost of a moderate drop in write performance.
-
-Similarly to `metatada_fsync`, this is likely not necessary
-if geographical replication is used.
-
-#### `block_size` {#block_size}
-
-Garage splits stored objects in consecutive chunks of size `block_size`
-(except the last one which might be smaller). The default size is 1MiB and
-should work in most cases. We recommend increasing it to e.g. 10MiB if
-you are using Garage to store large files and have fast network connections
-between all nodes (e.g. 1gbps).
-
-If you are interested in tuning this, feel free to do so (and remember to
-report your findings to us!). When this value is changed for a running Garage
-installation, only files newly uploaded will be affected. Previously uploaded
-files will remain available. This however means that chunks from existing files
-will not be deduplicated with chunks from newly uploaded files, meaning you
-might use more storage space that is optimally possible.
-
-#### `sled_cache_capacity` {#sled_cache_capacity}
-
-This parameter can be used to tune the capacity of the cache used by
-[sled](https://sled.rs), the database Garage uses internally to store metadata.
-Tune this to fit the RAM you wish to make available to your Garage instance.
-This value has a conservative default (128MB) so that Garage doesn't use too much
-RAM by default, but feel free to increase this for higher performance.
-
-#### `sled_flush_every_ms` {#sled_flush_every_ms}
-
-This parameters can be used to tune the flushing interval of sled.
-Increase this if sled is thrashing your SSD, at the risk of losing more data in case
-of a power outage (though this should not matter much as data is replicated on other
-nodes). The default value, 2000ms, should be appropriate for most use cases.
-
-#### `lmdb_map_size` {#lmdb_map_size}
-
-This parameters can be used to set the map size used by LMDB,
-which is the size of the virtual memory region used for mapping the database file.
-The value of this parameter is the maximum size the metadata database can take.
-This value is not bound by the physical RAM size of the machine running Garage.
-If not specified, it defaults to 1GiB on 32-bit machines and 1TiB on 64-bit machines.
-
-#### `compression_level` {#compression_level}
+### `compression_level`

 Zstd compression level to use for storing blocks.

@ -394,7 +334,7 @@ Compression is done synchronously, setting a value too high will add latency to
 This value can be different between nodes; compression is done by the node which receives the
 API call.

-#### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET` (env) {#rpc_secret}
+### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET` (env)

 Garage uses a secret key, called an RPC secret, that is shared between all
 nodes of the cluster in order to identify these nodes and allow them to
@ -406,7 +346,7 @@ Since Garage `v0.8.2`, the RPC secret can also be stored in a file whose path is
 given in the configuration variable `rpc_secret_file`, or specified as an
 environment variable `GARAGE_RPC_SECRET`.

-#### `rpc_bind_addr` {#rpc_bind_addr}
+### `rpc_bind_addr`

 The address and port on which to bind for inter-cluster communications
 (referred to as RPC for remote procedure calls).
@ -415,14 +355,14 @@ the node, even in the case of a NAT: the NAT should be configured to forward the
 port number to the same internal port number. This means that if you have several nodes running
 behind a NAT, they should each use a different RPC port number.

-#### `rpc_public_addr` {#rpc_public_addr}
+### `rpc_public_addr`

 The address and port that other nodes need to use to contact this node for
 RPC calls.  **This parameter is optional but recommended.** In case you have
 a NAT that binds the RPC port to a port that is different on your public IP,
 this field might help making it work.

-#### `bootstrap_peers` {#bootstrap_peers}
+### `bootstrap_peers`

 A list of peer identifiers on which to contact other Garage peers of this cluster.
 These peer identifiers have the following syntax:
@ -439,42 +379,42 @@ key will be returned by `garage node id` and you will have to add the IP
 yourself.


-### The `[consul_discovery]` section
+## The `[consul_discovery]` section

 Garage supports discovering other nodes of the cluster using Consul.  For this
 to work correctly, nodes need to know their IP address by which they can be
 reached by other nodes of the cluster, which should be set in `rpc_public_addr`.

-#### `consul_http_addr` {#consul_http_addr}
+### `consul_http_addr`

 The `consul_http_addr` parameter should be set to the full HTTP(S) address of the Consul server.

-#### `api` {#consul_api}
+### `api`

 Two APIs for service registration are supported: `catalog`  and `agent`. `catalog`, the default, will register a service using
 the `/v1/catalog` endpoints, enabling mTLS if `client_cert` and `client_key` are provided. The `agent` API uses the
 `v1/agent` endpoints instead, where an optional `token` may be provided.

-#### `service_name` {#consul_service_name}
+### `service_name`

 `service_name` should be set to the service name under which Garage's
 RPC ports are announced.

-#### `client_cert`, `client_key` {#consul_client_cert}
+### `client_cert`, `client_key`

 TLS client certificate and client key to use when communicating with Consul over TLS. Both are mandatory when doing so.
 Only available when `api = "catalog"`.

-#### `ca_cert` {#consul_ca_cert}
+### `ca_cert`

 TLS CA certificate to use when communicating with Consul over TLS.

-#### `tls_skip_verify` {#consul_tls_skip_verify}
+### `tls_skip_verify`

 Skip server hostname verification in TLS handshake.
 `ca_cert` is ignored when this is set.

-#### `token` {#consul_token}
+### `token`

 Uses the provided token for communication with Consul. Only available when `api = "agent"`.
 The policy assigned to this token should at least have these rules:
@ -494,49 +434,49 @@ node_prefix "" {
 }
 ```

-#### `tags` and `meta` {#consul_tags}
+### `tags` and `meta`

 Additional list of tags and map of service meta to add during service registration.

-### The `[kubernetes_discovery]` section
+## The `[kubernetes_discovery]` section

 Garage supports discovering other nodes of the cluster using kubernetes custom
 resources. For this to work, a `[kubernetes_discovery]` section must be present
 with at least the `namespace` and `service_name` parameters.

-#### `namespace` {#kube_namespace}
+### `namespace`

 `namespace` sets the namespace in which the custom resources are
 configured.

-#### `service_name` {#kube_service_name}
+### `service_name`

 `service_name` is added as a label to the advertised resources to
 filter them, to allow for multiple deployments in a single namespace.

-#### `skip_crd` {#kube_skip_crd}
+### `skip_crd`

 `skip_crd` can be set to true to disable the automatic creation and
 patching of the `garagenodes.deuxfleurs.fr` CRD. You will need to create the CRD
 manually.


-### The `[s3_api]` section
+## The `[s3_api]` section

-#### `api_bind_addr` {#s3_api_bind_addr}
+### `api_bind_addr`

 The IP and port on which to bind for accepting S3 API calls.
 This endpoint does not support TLS: a reverse proxy should be used to provide it.

 Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode.

-#### `s3_region` {#s3_region}
+### `s3_region`

 Garage will accept S3 API calls that are targeted to the S3 region defined here.
 API calls targeted to other regions will fail with an AuthorizationHeaderMalformed error
 message that redirects the client to the correct region.

-#### `root_domain` {#s3_root_domain}
+### `root_domain` {#root_domain}

 The optional suffix to access bucket using vhost-style in addition to path-style request.
 Note path-style requests are always enabled, whether or not vhost-style is configured.
@ -548,12 +488,12 @@ using the hostname `my-bucket.s3.garage.eu`.



-### The `[s3_web]` section
+## The `[s3_web]` section

 Garage allows to publish content of buckets as websites. This section configures the
 behaviour of this module.

-#### `bind_addr` {#web_bind_addr}
+### `bind_addr`

 The IP and port on which to bind for accepting HTTP requests to buckets configured
 for website access.
@ -561,7 +501,7 @@ This endpoint does not suport TLS: a reverse proxy should be used to provide it.

 Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode.

-#### `root_domain` {#web_root_domain}
+### `root_domain`

 The optional suffix appended to bucket names for the corresponding HTTP Host.

@ -570,11 +510,11 @@ will be accessible either with hostname `deuxfleurs.fr.web.garage.eu`
 or with hostname `deuxfleurs.fr`.


-### The `[admin]` section
+## The `[admin]` section

 Garage has a few administration capabilities, in particular to allow remote monitoring. These features are detailed below.

-#### `api_bind_addr` {#admin_api_bind_addr}
+### `api_bind_addr`

 If specified, Garage will bind an HTTP server to this port and address, on
 which it will listen to requests for administration features.
@ -583,7 +523,7 @@ See [administration API reference](@/documentation/reference-manual/admin-api.md
 Alternatively, since `v0.8.5`, a path can be used to create a unix socket. Note that for security reasons,
 the socket will have 0220 mode. Make sure to set user and group permissions accordingly.

-#### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN` (env) {#admin_metrics_token}
+### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN` (env)

 The token for accessing the Metrics endpoint. If this token is not set, the
 Metrics endpoint can be accessed without access control.
@ -594,7 +534,7 @@ You can use any random string for this value. We recommend generating a random t
 `metrics_token_file` and the `GARAGE_METRICS_TOKEN` environment variable are supported since Garage `v0.8.2`.


-#### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN` (env) {#admin_token}
+### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN` (env)

 The token for accessing all of the other administration endpoints.  If this
 token is not set, access to these endpoints is disabled entirely.
@ -605,7 +545,7 @@ You can use any random string for this value. We recommend generating a random t
 `admin_token_file` and the `GARAGE_ADMIN_TOKEN` environment variable are supported since Garage `v0.8.2`.


-#### `trace_sink` {#admin_trace_sink}
+### `trace_sink`

 Optionally, the address of an OpenTelemetry collector.  If specified,
 Garage will send traces in the OpenTelemetry format to this endpoint. These
--- a/doc/book/reference-manual/features.md
+++ b/doc/book/reference-manual/features.md
@ -52,7 +52,7 @@ This is particularly usefull when nodes are far from one another and talk to one

 Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data,
 and with various levels of consistency, in order to adapt to a variety of usage scenarios.
-Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode)
+Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication-mode)
 to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want).

 ### Web server for static websites
--- a/doc/drafts/admin-api.md
+++ b/doc/drafts/admin-api.md
@ -69,8 +69,8 @@ Example response body:

 ```json
 {
-  "node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
-  "garageVersion": "git:v0.9.0-dev",
+  "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
+  "garageVersion": "v0.10.0",
  "garageFeatures": [
    "k2v",
    "sled",
@ -81,83 +81,92 @@ Example response body:
  ],
  "rustVersion": "1.68.0",
  "dbEngine": "LMDB (using Heed crate)",
-  "knownNodes": [
+  "layoutVersion": 5,
+  "nodes": [
    {
-      "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
-      "addr": "10.0.0.11:3901",
-      "isUp": true,
-      "lastSeenSecsAgo": 9,
-      "hostname": "node1"
-    },
-    {
-      "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
-      "addr": "10.0.0.12:3901",
-      "isUp": true,
-      "lastSeenSecsAgo": 1,
-      "hostname": "node2"
-    },
-    {
-      "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
-      "addr": "10.0.0.21:3901",
-      "isUp": true,
-      "lastSeenSecsAgo": 7,
-      "hostname": "node3"
-    },
-    {
-      "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
-      "addr": "10.0.0.22:3901",
-      "isUp": true,
-      "lastSeenSecsAgo": 1,
-      "hostname": "node4"
-    }
-  ],
-  "layout": {
-    "version": 12,
-    "roles": [
-      {
-        "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f",
+      "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
+      "role": {
+        "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c",
        "zone": "dc1",
-        "capacity": 10737418240,
-        "tags": [
-          "node1"
-        ]
+        "capacity": 100000000000,
+        "tags": []
+      },
+      "addr": "10.0.0.3:3901",
+      "hostname": "node3",
+      "isUp": true,
+      "lastSeenSecsAgo": 12,
+      "draining": false,
+      "dataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      },
+      "metadataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      }
    },
    {
-        "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff",
+      "id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f",
+      "role": null,
+      "addr": "10.0.0.2:3901",
+      "hostname": "node2",
+      "isUp": true,
+      "lastSeenSecsAgo": 11,
+      "draining": true,
+      "dataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      },
+      "metadataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      }
+    },
+    {
+      "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
+      "role": {
+        "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2",
        "zone": "dc1",
-        "capacity": 10737418240,
-        "tags": [
-          "node2"
-        ]
+        "capacity": 100000000000,
+        "tags": []
+      },
+      "addr": "127.0.0.1:3904",
+      "hostname": "lindy",
+      "isUp": true,
+      "lastSeenSecsAgo": 2,
+      "draining": false,
+      "dataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      },
+      "metadataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      }
    },
    {
-        "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
-        "zone": "dc2",
-        "capacity": 10737418240,
-        "tags": [
-          "node3"
-        ]
+      "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
+      "role": {
+        "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
+        "zone": "dc1",
+        "capacity": 100000000000,
+        "tags": []
+      },
+      "addr": "10.0.0.1:3901",
+      "hostname": "node1",
+      "isUp": true,
+      "lastSeenSecsAgo": 3,
+      "draining": false,
+      "dataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
+      },
+      "metadataPartition": {
+        "available": 660270088192,
+        "total": 873862266880
      }
-    ],
-    "stagedRoleChanges": [
-      {
-        "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b",
-        "remove": false,
-        "zone": "dc2",
-        "capacity": 10737418240,
-        "tags": [
-          "node4"
-        ]
-      }
-      {
-        "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27",
-        "remove": true,
-        "zone": null,
-        "capacity": null,
-        "tags": null,
    }
  ]
-  }
 }
 ```

--- a/flake.lock
+++ b/flake.lock
@ -1,5 +1,31 @@
 {
  "nodes": {
+    "cargo2nix": {
+      "inputs": {
+        "flake-compat": [
+          "flake-compat"
+        ],
+        "flake-utils": "flake-utils",
+        "nixpkgs": [
+          "nixpkgs"
+        ],
+        "rust-overlay": "rust-overlay"
+      },
+      "locked": {
+        "lastModified": 1666087781,
+        "narHash": "sha256-trKVdjMZ8mNkGfLcY5LsJJGtdV3xJDZnMVrkFjErlcs=",
+        "owner": "Alexis211",
+        "repo": "cargo2nix",
+        "rev": "a7a61179b66054904ef6a195d8da736eaaa06c36",
+        "type": "github"
+      },
+      "original": {
+        "owner": "Alexis211",
+        "repo": "cargo2nix",
+        "rev": "a7a61179b66054904ef6a195d8da736eaaa06c36",
+        "type": "github"
+      }
+    },
    "flake-compat": {
      "locked": {
        "lastModified": 1688025799,
@ -20,19 +46,54 @@
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1701680307,
-        "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=",
+        "lastModified": 1681202837,
+        "narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "4022d587cbbfd70fe950c1e2083a02621806a725",
+        "rev": "cfacdce06f30d2b68473a46042957675eebb3401",
        "type": "github"
      },
      "original": {
-        "id": "flake-utils",
-        "type": "indirect"
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "flake-utils_2": {
+      "inputs": {
+        "systems": "systems_2"
+      },
+      "locked": {
+        "lastModified": 1681202837,
+        "narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "cfacdce06f30d2b68473a46042957675eebb3401",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
      }
    },
    "nixpkgs": {
+      "locked": {
+        "lastModified": 1682109806,
+        "narHash": "sha256-d9g7RKNShMLboTWwukM+RObDWWpHKaqTYXB48clBWXI=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "2362848adf8def2866fabbffc50462e929d7fffb",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixpkgs-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nixpkgs_2": {
      "locked": {
        "lastModified": 1682423271,
        "narHash": "sha256-WHhl1GiOij1ob4cTLL+yhqr+vFOUH8E5wAX8Ir8fvjE=",
@ -50,9 +111,32 @@
    },
    "root": {
      "inputs": {
+        "cargo2nix": "cargo2nix",
        "flake-compat": "flake-compat",
-        "flake-utils": "flake-utils",
+        "flake-utils": [
+          "cargo2nix",
+          "flake-utils"
+        ],
+        "nixpkgs": "nixpkgs_2"
+      }
+    },
+    "rust-overlay": {
+      "inputs": {
+        "flake-utils": "flake-utils_2",
        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1682389182,
+        "narHash": "sha256-8t2nmFnH+8V48+IJsf8AK51ebXNlVbOSVYOpiqJKvJE=",
+        "owner": "oxalica",
+        "repo": "rust-overlay",
+        "rev": "74f1a64dd28faeeb85ef081f32cad2989850322c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "oxalica",
+        "repo": "rust-overlay",
+        "type": "github"
      }
    },
    "systems": {
@ -69,6 +153,21 @@
        "repo": "default",
        "type": "github"
      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@ -8,23 +8,50 @@

  inputs.flake-compat.url = "github:nix-community/flake-compat";

-  outputs = { self, nixpkgs, flake-utils, ... }:
+  inputs.cargo2nix = {
+    # As of 2022-10-18: two small patches over unstable branch, one for clippy and one to fix feature detection
+    url = "github:Alexis211/cargo2nix/a7a61179b66054904ef6a195d8da736eaaa06c36";
+
+    # As of 2023-04-25:
+    # - my two patches were merged into unstable (one for clippy and one to "fix" feature detection)
+    # - rustc v1.66
+    # url = "github:cargo2nix/cargo2nix/8fb57a670f7993bfc24099c33eb9c5abb51f29a2";
+
+    # Rust overlay as of 2023-04-25
+    inputs.rust-overlay.url =
+      "github:oxalica/rust-overlay/74f1a64dd28faeeb85ef081f32cad2989850322c";
+
+    inputs.nixpkgs.follows = "nixpkgs";
+    inputs.flake-compat.follows = "flake-compat";
+  };
+
+  inputs.flake-utils.follows = "cargo2nix/flake-utils";
+
+  outputs = { self, nixpkgs, cargo2nix, flake-utils, ... }:
    let
-      gitVersion = self.lastModifiedDate;
+      git_version = self.lastModifiedDate;
      compile = import ./nix/compile.nix;
    in
    flake-utils.lib.eachDefaultSystem (system:
      let pkgs = nixpkgs.legacyPackages.${system};
      in {
        packages = {
-          default = (import ./default.nix {
-            inherit gitVersion;
-            buildSystem = system;
+          default = (compile {
+            inherit system git_version;
+            pkgsSrc = nixpkgs;
+            cargo2nixOverlay = cargo2nix.overlays.default;
            release = true;
-          }).build;
+          }).workspace.garage { compileMode = "build"; };
        };
-        devShell = (import ./shell.nix {
-          buildSystem = system;
-        }).rust;
+        devShell = (compile {
+          inherit system git_version;
+          pkgsSrc = nixpkgs;
+          cargo2nixOverlay = cargo2nix.overlays.default;
+          release = false;
+        }).workspaceShell { packages = with pkgs; [
+          rustfmt
+          clang
+          mold
+        ]; };
      });
 }
--- a/nix/build_index.nix
+++ b/nix/build_index.nix
@ -1,6 +1,6 @@
 { path ? "/../aws-list.txt", }:

-with import ./pkgs.nix;
+with import ./common.nix;
 let
  pkgs = import pkgsSrc { };
  lib = pkgs.lib;
--- a/nix/common.nix
+++ b/nix/common.nix
@ -0,0 +1,17 @@
+let
+  lock = builtins.fromJSON (builtins.readFile ../flake.lock);
+
+  inherit (lock.nodes.flake-compat.locked) owner repo rev narHash;
+
+  flake-compat = fetchTarball {
+    url = "https://github.com/${owner}/${repo}/archive/${rev}.tar.gz";
+    sha256 = narHash;
+  };
+
+  flake = (import flake-compat { system = builtins.currentSystem; src = ../.; });
+in
+rec {
+  pkgsSrc = flake.defaultNix.inputs.nixpkgs;
+  cargo2nix = flake.defaultNix.inputs.cargo2nix;
+  cargo2nixOverlay = cargo2nix.overlays.default;
+}
--- a/nix/pkgs.nix
+++ b/nix/pkgs.nix
@ -1,8 +0,0 @@
-let
-  lock = builtins.fromJSON (builtins.readFile ../flake.lock);
-  inherit (lock.nodes.nixpkgs.locked) owner repo rev narHash;
-in
-  fetchTarball {
-    url = "https://github.com/${owner}/${repo}/archive/${rev}.tar.gz";
-    sha256 = narHash;
-  }
--- a/shell.nix
+++ b/shell.nix
@ -1,12 +1,12 @@
-{
-  buildSystem ? builtins.currentSystem,
-  targetSystem ? buildSystem,
-}:
+{ system ? builtins.currentSystem, }:

-with import ./nix/pkgs.nix;
+with import ./nix/common.nix;

 let
-  inherit (import ./default.nix { inherit buildSystem targetSystem; }) pkgs pkgsCross buildEnv;
+  pkgs = import pkgsSrc {
+    inherit system;
+    overlays = [ cargo2nixOverlay ];
+  };
  kaniko = (import ./nix/kaniko.nix) pkgs;
  manifest-tool = (import ./nix/manifest-tool.nix) pkgs;
  winscp = (import ./nix/winscp.nix) pkgs;
@ -14,13 +14,22 @@ let
 in {
  # --- Rust Shell ---
  # Use it to compile Garage
-  rust = pkgsCross.mkShell (buildEnv // {
-    inputsFrom = [
-      kaniko
-      manifest-tool
-      winscp
+  rust = pkgs.mkShell {
+    nativeBuildInputs = with pkgs; [
+      #rustPlatform.rust.rustc
+      rustPlatform.rust.cargo
+      clang
+      mold
+      #clippy
+      rustfmt
+      #perl
+      #protobuf
+      #pkg-config
+      #openssl
+      file
+      #cargo2nix.packages.x86_64-linux.cargo2nix
    ];
-  });
+  };

  # --- Integration shell ---
  # Use it to test Garage with common S3 clients
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@ -45,7 +45,7 @@ http = "0.2"
 httpdate = "1.0"
 http-range = "0.1"
 hyper = { version = "0.14", features = ["server", "http1", "runtime", "tcp", "stream"] }
-#hyperlocal = { version = "0.8.0", default-features = false, features = ["server"] }
+hyperlocal = { version = "0.8.0", default-features = false, features = ["server"] }
 multer = "2.0"
 percent-encoding = "2.1.0"
 roxmltree = "0.18"
--- a/src/api/admin/api_server.rs
+++ b/src/api/admin/api_server.rs
@ -182,7 +182,7 @@ impl AdminApiServer {
 			),
 		};
 		let status_str = format!(
-			"{}\nConsult the full health check API endpoint at /v1/health for more details\n",
+			"{}\nConsult the full health check API endpoint at /v0/health for more details\n",
 			status_str
 		);

@ -279,7 +279,7 @@ impl ApiHandler for AdminApiServer {
 			Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,
 			Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await,
 			Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await,
-			Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await,
+			Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await,
 			// Keys
 			Endpoint::ListKeys => handle_list_keys(&self.garage).await,
 			Endpoint::GetKeyInfo {
--- a/src/api/admin/bucket.rs
+++ b/src/api/admin/bucket.rs
@ -122,7 +122,7 @@ async fn bucket_info_results(
 		.table
 		.get(&bucket_id, &EmptyKey)
 		.await?
-		.map(|x| x.filtered_values(&garage.system.ring.borrow()))
+		.map(|x| x.filtered_values(&garage.system.cluster_layout()))
 		.unwrap_or_default();

 	let mpu_counters = garage
@ -130,7 +130,7 @@ async fn bucket_info_results(
 		.table
 		.get(&bucket_id, &EmptyKey)
 		.await?
-		.map(|x| x.filtered_values(&garage.system.ring.borrow()))
+		.map(|x| x.filtered_values(&garage.system.cluster_layout()))
 		.unwrap_or_default();

 	let mut relevant_keys = HashMap::new();
--- a/src/api/admin/cluster.rs
+++ b/src/api/admin/cluster.rs
@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::net::SocketAddr;
 use std::sync::Arc;

@ -15,25 +16,95 @@ use crate::admin::error::*;
 use crate::helpers::{json_ok_response, parse_json_body};

 pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+	let layout = garage.system.cluster_layout();
+	let mut nodes = garage
+		.system
+		.get_known_nodes()
+		.into_iter()
+		.map(|i| {
+			(
+				i.id,
+				NodeResp {
+					id: hex::encode(i.id),
+					addr: Some(i.addr),
+					hostname: i.status.hostname,
+					is_up: i.is_up,
+					last_seen_secs_ago: i.last_seen_secs_ago,
+					data_partition: i
+						.status
+						.data_disk_avail
+						.map(|(avail, total)| FreeSpaceResp {
+							available: avail,
+							total,
+						}),
+					metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| {
+						FreeSpaceResp {
+							available: avail,
+							total,
+						}
+					}),
+					..Default::default()
+				},
+			)
+		})
+		.collect::<HashMap<_, _>>();
+
+	for (id, _, role) in layout.current().roles.items().iter() {
+		if let layout::NodeRoleV(Some(r)) = role {
+			let role = NodeRoleResp {
+				id: hex::encode(id),
+				zone: r.zone.to_string(),
+				capacity: r.capacity,
+				tags: r.tags.clone(),
+			};
+			match nodes.get_mut(id) {
+				None => {
+					nodes.insert(
+						*id,
+						NodeResp {
+							id: hex::encode(id),
+							role: Some(role),
+							..Default::default()
+						},
+					);
+				}
+				Some(n) => {
+					if n.role.is_none() {
+						n.role = Some(role);
+					}
+				}
+			}
+		}
+	}
+
+	for ver in layout.versions.iter().rev().skip(1) {
+		for (id, _, role) in ver.roles.items().iter() {
+			if let layout::NodeRoleV(Some(r)) = role {
+				if !nodes.contains_key(id) && r.capacity.is_some() {
+					nodes.insert(
+						*id,
+						NodeResp {
+							id: hex::encode(id),
+							draining: true,
+							..Default::default()
+						},
+					);
+				}
+			}
+		}
+	}
+
+	let mut nodes = nodes.into_values().collect::<Vec<_>>();
+	nodes.sort_by(|x, y| x.id.cmp(&y.id));
+
 	let res = GetClusterStatusResponse {
 		node: hex::encode(garage.system.id),
 		garage_version: garage_util::version::garage_version(),
 		garage_features: garage_util::version::garage_features(),
 		rust_version: garage_util::version::rust_version(),
 		db_engine: garage.db.engine(),
-		known_nodes: garage
-			.system
-			.get_known_nodes()
-			.into_iter()
-			.map(|i| KnownNodeResp {
-				id: hex::encode(i.id),
-				addr: i.addr,
-				is_up: i.is_up,
-				last_seen_secs_ago: i.last_seen_secs_ago,
-				hostname: i.status.hostname,
-			})
-			.collect(),
-		layout: format_cluster_layout(&garage.system.get_cluster_layout()),
+		layout_version: layout.current().version,
+		nodes,
 	};

 	Ok(json_ok_response(&res)?)
@ -84,13 +155,14 @@ pub async fn handle_connect_cluster_nodes(
 }

 pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
-	let res = format_cluster_layout(&garage.system.get_cluster_layout());
+	let res = format_cluster_layout(&garage.system.cluster_layout());

 	Ok(json_ok_response(&res)?)
 }

-fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResponse {
+fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse {
 	let roles = layout
+		.current()
 		.roles
 		.items()
 		.iter()
@ -104,10 +176,12 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
 		.collect::<Vec<_>>();

 	let staged_role_changes = layout
-		.staging_roles
+		.staging
+		.get()
+		.roles
 		.items()
 		.iter()
-		.filter(|(k, _, v)| layout.roles.get(k) != Some(v))
+		.filter(|(k, _, v)| layout.current().roles.get(k) != Some(v))
 		.map(|(k, _, v)| match &v.0 {
 			None => NodeRoleChange {
 				id: hex::encode(k),
@ -125,7 +199,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp
 		.collect::<Vec<_>>();

 	GetClusterLayoutResponse {
-		version: layout.version,
+		version: layout.current().version,
 		roles,
 		staged_role_changes,
 	}
@ -154,8 +228,8 @@ struct GetClusterStatusResponse {
 	garage_features: Option<&'static [&'static str]>,
 	rust_version: &'static str,
 	db_engine: String,
-	known_nodes: Vec<KnownNodeResp>,
-	layout: GetClusterLayoutResponse,
+	layout_version: u64,
+	nodes: Vec<NodeResp>,
 }

 #[derive(Serialize)]
@ -189,14 +263,27 @@ struct NodeRoleResp {
 	tags: Vec<String>,
 }

-#[derive(Serialize)]
+#[derive(Serialize, Default)]
 #[serde(rename_all = "camelCase")]
-struct KnownNodeResp {
+struct FreeSpaceResp {
+	available: u64,
+	total: u64,
+}
+
+#[derive(Serialize, Default)]
+#[serde(rename_all = "camelCase")]
+struct NodeResp {
 	id: String,
-	addr: SocketAddr,
+	role: Option<NodeRoleResp>,
+	addr: Option<SocketAddr>,
+	hostname: Option<String>,
 	is_up: bool,
 	last_seen_secs_ago: Option<u64>,
-	hostname: String,
+	draining: bool,
+	#[serde(skip_serializing_if = "Option::is_none")]
+	data_partition: Option<FreeSpaceResp>,
+	#[serde(skip_serializing_if = "Option::is_none")]
+	metadata_partition: Option<FreeSpaceResp>,
 }

 // ---- update functions ----
@ -207,10 +294,10 @@ pub async fn handle_update_cluster_layout(
 ) -> Result<Response<Body>, Error> {
 	let updates = parse_json_body::<UpdateClusterLayoutRequest>(req).await?;

-	let mut layout = garage.system.get_cluster_layout();
+	let mut layout = garage.system.cluster_layout().clone();

-	let mut roles = layout.roles.clone();
-	roles.merge(&layout.staging_roles);
+	let mut roles = layout.current().roles.clone();
+	roles.merge(&layout.staging.get().roles);

 	for change in updates {
 		let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?;
@ -231,11 +318,17 @@ pub async fn handle_update_cluster_layout(
 		};

 		layout
-			.staging_roles
+			.staging
+			.get_mut()
+			.roles
 			.merge(&roles.update_mutator(node, layout::NodeRoleV(new_role)));
 	}

-	garage.system.update_cluster_layout(&layout).await?;
+	garage
+		.system
+		.layout_manager
+		.update_cluster_layout(&layout)
+		.await?;

 	let res = format_cluster_layout(&layout);
 	Ok(json_ok_response(&res)?)
@ -245,12 +338,16 @@ pub async fn handle_apply_cluster_layout(
 	garage: &Arc<Garage>,
 	req: Request<Body>,
 ) -> Result<Response<Body>, Error> {
-	let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
+	let param = parse_json_body::<ApplyLayoutRequest>(req).await?;

-	let layout = garage.system.get_cluster_layout();
+	let layout = garage.system.cluster_layout().clone();
 	let (layout, msg) = layout.apply_staged_changes(Some(param.version))?;

-	garage.system.update_cluster_layout(&layout).await?;
+	garage
+		.system
+		.layout_manager
+		.update_cluster_layout(&layout)
+		.await?;

 	let res = ApplyClusterLayoutResponse {
 		message: msg,
@ -259,15 +356,14 @@ pub async fn handle_apply_cluster_layout(
 	Ok(json_ok_response(&res)?)
 }

-pub async fn handle_revert_cluster_layout(
-	garage: &Arc<Garage>,
-	req: Request<Body>,
-) -> Result<Response<Body>, Error> {
-	let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
-
-	let layout = garage.system.get_cluster_layout();
-	let layout = layout.revert_staged_changes(Some(param.version))?;
-	garage.system.update_cluster_layout(&layout).await?;
+pub async fn handle_revert_cluster_layout(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+	let layout = garage.system.cluster_layout().clone();
+	let layout = layout.revert_staged_changes()?;
+	garage
+		.system
+		.layout_manager
+		.update_cluster_layout(&layout)
+		.await?;

 	let res = format_cluster_layout(&layout);
 	Ok(json_ok_response(&res)?)
@ -279,7 +375,7 @@ type UpdateClusterLayoutRequest = Vec<NodeRoleChange>;

 #[derive(Deserialize)]
 #[serde(rename_all = "camelCase")]
-struct ApplyRevertLayoutRequest {
+struct ApplyLayoutRequest {
 	version: u64,
 }

--- a/src/api/common_error.rs
+++ b/src/api/common_error.rs
@ -53,9 +53,7 @@ impl CommonError {
 	pub fn http_status_code(&self) -> StatusCode {
 		match self {
 			CommonError::InternalError(
-				GarageError::Timeout
-				| GarageError::RemoteError(_)
-				| GarageError::Quorum(_, _, _, _),
+				GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
 			) => StatusCode::SERVICE_UNAVAILABLE,
 			CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
 				StatusCode::INTERNAL_SERVER_ERROR
@ -72,9 +70,7 @@ impl CommonError {
 		match self {
 			CommonError::Forbidden(_) => "AccessDenied",
 			CommonError::InternalError(
-				GarageError::Timeout
-				| GarageError::RemoteError(_)
-				| GarageError::Quorum(_, _, _, _),
+				GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..),
 			) => "ServiceUnavailable",
 			CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
 				"InternalError"
--- a/src/api/generic_server.rs
+++ b/src/api/generic_server.rs
@ -1,4 +1,5 @@
-//use std::fs::{self, Permissions};
+use std::fs::{self, Permissions};
+use std::os::unix::fs::PermissionsExt;
 use std::sync::Arc;

 use async_trait::async_trait;
@ -11,9 +12,9 @@ use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Request, Response, Server};
 use hyper::{HeaderMap, StatusCode};

-//use hyperlocal::UnixServerExt;
+use hyperlocal::UnixServerExt;

-//use tokio::net::UnixStream;
+use tokio::net::UnixStream;

 use opentelemetry::{
 	global,
@ -113,18 +114,18 @@ impl<A: ApiHandler> ApiServer<A> {
 			}
 		});

-		//let unix_service = make_service_fn(|_: &UnixStream| {
-		//	let this = self.clone();
+		let unix_service = make_service_fn(|_: &UnixStream| {
+			let this = self.clone();

-		//	let path = bind_addr.to_string();
-		//	async move {
-		//		Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
-		//			let this = this.clone();
+			let path = bind_addr.to_string();
+			async move {
+				Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
+					let this = this.clone();

-		//			this.handler(req, path.clone())
-		//		}))
-		//	}
-		//});
+					this.handler(req, path.clone())
+				}))
+			}
+		});

 		info!(
 			"{} API server listening on {}",
@ -139,24 +140,23 @@ impl<A: ApiHandler> ApiServer<A> {
 					.with_graceful_shutdown(shutdown_signal)
 					.await?
 			}
-			UnixOrTCPSocketAddress::UnixSocket(_path) => {
-				panic!("Unix sockets are not supported in this fork") // TODO(mediocregopher)
-			} //UnixOrTCPSocketAddress::UnixSocket(ref path) => {
-			  //    use std::os::unix::fs::PermissionsExt;
-			  //    remove_unix_socket_if_present(path).await?;
+			UnixOrTCPSocketAddress::UnixSocket(ref path) => {
+				if path.exists() {
+					fs::remove_file(path)?
+				}

-			  //	let bound = Server::bind_unix(path)?;
+				let bound = Server::bind_unix(path)?;

-			  //	fs::set_permissions(
-			  //		path,
-			  //		Permissions::from_mode(unix_bind_addr_mode.unwrap_or(0o222)),
-			  //	)?;
+				fs::set_permissions(
+					path,
+					Permissions::from_mode(unix_bind_addr_mode.unwrap_or(0o222)),
+				)?;

-			  //	bound
-			  //		.serve(unix_service)
-			  //		.with_graceful_shutdown(shutdown_signal)
-			  //		.await?;
-			  //}
+				bound
+					.serve(unix_service)
+					.with_graceful_shutdown(shutdown_signal)
+					.await?;
+			}
 		};

 		Ok(())
--- a/src/api/k2v/index.rs
+++ b/src/api/k2v/index.rs
@ -5,7 +5,6 @@ use serde::Serialize;

 use garage_util::data::*;

-use garage_rpc::ring::Ring;
 use garage_table::util::*;

 use garage_model::garage::Garage;
@ -26,7 +25,11 @@ pub async fn handle_read_index(
 ) -> Result<Response<Body>, Error> {
 	let reverse = reverse.unwrap_or(false);

-	let ring: Arc<Ring> = garage.system.ring.borrow().clone();
+	let node_id_vec = garage
+		.system
+		.cluster_layout()
+		.all_nongateway_nodes()
+		.to_vec();

 	let (partition_keys, more, next_start) = read_range(
 		&garage.k2v.counter_table.table,
@ -35,7 +38,7 @@ pub async fn handle_read_index(
 		&start,
 		&end,
 		limit,
-		Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
+		Some((DeletedFilter::NotDeleted, node_id_vec)),
 		EnumerationOrder::from_reverse(reverse),
 	)
 	.await?;
@ -54,7 +57,7 @@ pub async fn handle_read_index(
 		partition_keys: partition_keys
 			.into_iter()
 			.map(|part| {
-				let vals = part.filtered_values(&ring);
+				let vals = part.filtered_values(&garage.system.cluster_layout());
 				ReadIndexResponseEntry {
 					pk: part.sk,
 					entries: *vals.get(&s_entries).unwrap_or(&0),
--- a/src/api/s3/api_server.rs
+++ b/src/api/s3/api_server.rs
@ -344,7 +344,7 @@ impl ApiHandler for S3ApiServer {
 						bucket_id,
 						key,
 						upload_id,
-						part_number_marker: part_number_marker.map(|p| p.min(10000)),
+						part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)),
 						max_parts: max_parts.unwrap_or(1000).clamp(1, 1000),
 					},
 				)
--- a/src/api/s3/put.rs
+++ b/src/api/s3/put.rs
@ -253,7 +253,7 @@ pub(crate) async fn check_quotas(
 		.await?;

 	let counters = counters
-		.map(|x| x.filtered_values(&garage.system.ring.borrow()))
+		.map(|x| x.filtered_values(&garage.system.cluster_layout()))
 		.unwrap_or_default();

 	let (prev_cnt_obj, prev_cnt_size) = match prev_object {
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@ -264,8 +264,10 @@ impl BlockManager {
 		F: Fn(DataBlockHeader, ByteStream) -> Fut,
 		Fut: futures::Future<Output = Result<T, Error>>,
 	{
-		let who = self.replication.read_nodes(hash);
-		let who = self.system.rpc.request_order(&who);
+		let who = self
+			.system
+			.rpc_helper()
+			.block_read_nodes_of(hash, self.system.rpc_helper());

 		for node in who.iter() {
 			let node_id = NodeID::from(*node);
@ -305,7 +307,7 @@ impl BlockManager {
 				// if the first one doesn't succeed rapidly
 				// TODO: keep first request running when initiating a new one and take the
 				// one that finishes earlier
-				_ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
+				_ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => {
 					debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node);
 				}
 			};
@ -354,7 +356,7 @@ impl BlockManager {

 	/// Send block to nodes that should have it
 	pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> {
-		let who = self.replication.write_nodes(&hash);
+		let who = self.replication.write_sets(&hash);

 		let (header, bytes) = DataBlock::from_buffer(data, self.compression_level)
 			.await
@ -363,10 +365,10 @@ impl BlockManager {
 			Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes);

 		self.system
-			.rpc
-			.try_call_many(
+			.rpc_helper()
+			.try_write_many_sets(
 				&self.endpoint,
-				&who[..],
+				who.as_ref(),
 				put_block_rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
 					.with_quorum(self.replication.write_quorum()),
@ -439,7 +441,7 @@ impl BlockManager {
 			tokio::spawn(async move {
 				if let Err(e) = this
 					.resync
-					.put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout())
+					.put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout())
 				{
 					error!("Block {:?} could not be put in resync queue: {}.", hash, e);
 				}
@ -533,7 +535,7 @@ impl BlockManager {
 				None => {
 					// Not found but maybe we should have had it ??
 					self.resync
-						.put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?;
+						.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
 					return Err(Error::Message(format!(
 						"block {:?} not found on node",
 						hash
@ -771,7 +773,11 @@ impl BlockManagerLocked {
 			// Now, we do an fsync on the containing directory, to ensure that the rename
 			// is persisted properly. See:
 			// http://thedjbway.b0llix.net/qmail/syncdir.html
-			let dir = fs::OpenOptions::new().read(true).open(directory).await?;
+			let dir = fs::OpenOptions::new()
+				.read(true)
+				.mode(0)
+				.open(directory)
+				.await?;
 			dir.sync_all().await?;
 			drop(dir);
 		}
--- a/src/block/resync.rs
+++ b/src/block/resync.rs
@ -377,7 +377,7 @@ impl BlockResyncManager {
 			info!("Resync block {:?}: offloading and deleting", hash);
 			let existing_path = existing_path.unwrap();

-			let mut who = manager.replication.write_nodes(hash);
+			let mut who = manager.replication.storage_nodes(hash);
 			if who.len() < manager.replication.write_quorum() {
 				return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
 			}
@ -385,7 +385,7 @@ impl BlockResyncManager {

 			let who_needs_resps = manager
 				.system
-				.rpc
+				.rpc_helper()
 				.call_many(
 					&manager.endpoint,
 					&who,
@ -431,10 +431,10 @@ impl BlockResyncManager {
 				.with_stream_from_buffer(bytes);
 				manager
 					.system
-					.rpc
+					.rpc_helper()
 					.try_call_many(
 						&manager.endpoint,
-						&need_nodes[..],
+						&need_nodes,
 						put_block_message,
 						RequestStrategy::with_priority(PRIO_BACKGROUND)
 							.with_quorum(need_nodes.len()),
--- a/src/garage/admin/bucket.rs
+++ b/src/garage/admin/bucket.rs
@ -70,7 +70,7 @@ impl AdminRpcHandler {
 			.table
 			.get(&bucket_id, &EmptyKey)
 			.await?
-			.map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
+			.map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
 			.unwrap_or_default();

 		let mpu_counters = self
@ -79,7 +79,7 @@ impl AdminRpcHandler {
 			.table
 			.get(&bucket_id, &EmptyKey)
 			.await?
-			.map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
+			.map(|x| x.filtered_values(&self.garage.system.cluster_layout()))
 			.unwrap_or_default();

 		let mut relevant_keys = HashMap::new();
--- a/src/garage/admin/mod.rs
+++ b/src/garage/admin/mod.rs
@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError;
 use garage_table::replication::*;
 use garage_table::*;

-use garage_rpc::ring::PARTITION_BITS;
+use garage_rpc::layout::PARTITION_BITS;
 use garage_rpc::*;

 use garage_block::manager::BlockResyncErrorInfo;
@ -126,8 +126,8 @@ impl AdminRpcHandler {
 			opt_to_send.all_nodes = false;

 			let mut failures = vec![];
-			let ring = self.garage.system.ring.borrow().clone();
-			for node in ring.layout.node_ids().iter() {
+			let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
+			for node in all_nodes.iter() {
 				let node = (*node).into();
 				let resp = self
 					.endpoint
@ -163,9 +163,9 @@ impl AdminRpcHandler {
 	async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRpc, Error> {
 		if opt.all_nodes {
 			let mut ret = String::new();
-			let ring = self.garage.system.ring.borrow().clone();
+			let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();

-			for node in ring.layout.node_ids().iter() {
+			for node in all_nodes.iter() {
 				let mut opt = opt.clone();
 				opt.all_nodes = false;
 				opt.skip_global = true;
@ -274,11 +274,11 @@ impl AdminRpcHandler {
 	fn gather_cluster_stats(&self) -> String {
 		let mut ret = String::new();

-		// Gather storage node and free space statistics
-		let layout = &self.garage.system.ring.borrow().layout;
+		// Gather storage node and free space statistics for current nodes
+		let layout = &self.garage.system.cluster_layout();
 		let mut node_partition_count = HashMap::<Uuid, u64>::new();
-		for short_id in layout.ring_assignment_data.iter() {
-			let id = layout.node_id_vec[*short_id as usize];
+		for short_id in layout.current().ring_assignment_data.iter() {
+			let id = layout.current().node_id_vec[*short_id as usize];
 			*node_partition_count.entry(id).or_default() += 1;
 		}
 		let node_info = self
@ -293,8 +293,8 @@ impl AdminRpcHandler {
 		for (id, parts) in node_partition_count.iter() {
 			let info = node_info.get(id);
 			let status = info.map(|x| &x.status);
-			let role = layout.roles.get(id).and_then(|x| x.0.as_ref());
-			let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?");
+			let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref());
+			let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?");
 			let zone = role.map(|x| x.zone.as_str()).unwrap_or("?");
 			let capacity = role
 				.map(|x| x.capacity_string())
@ -440,8 +440,8 @@ impl AdminRpcHandler {
 	) -> Result<AdminRpc, Error> {
 		if all_nodes {
 			let mut ret = vec![];
-			let ring = self.garage.system.ring.borrow().clone();
-			for node in ring.layout.node_ids().iter() {
+			let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
+			for node in all_nodes.iter() {
 				let node = (*node).into();
 				match self
 					.endpoint
@ -488,8 +488,8 @@ impl AdminRpcHandler {
 	) -> Result<AdminRpc, Error> {
 		if all_nodes {
 			let mut ret = vec![];
-			let ring = self.garage.system.ring.borrow().clone();
-			for node in ring.layout.node_ids().iter() {
+			let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec();
+			for node in all_nodes.iter() {
 				let node = (*node).into();
 				match self
 					.endpoint
--- a/src/garage/cli/cmd.rs
+++ b/src/garage/cli/cmd.rs
@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::time::Duration;

 use format_table::format_table;
@ -49,21 +49,15 @@ pub async fn cli_command_dispatch(
 }

 pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
-	let status = match rpc_cli
-		.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
-		.await??
-	{
-		SystemRpc::ReturnKnownNodes(nodes) => nodes,
-		resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
-	};
+	let status = fetch_status(rpc_cli, rpc_host).await?;
 	let layout = fetch_layout(rpc_cli, rpc_host).await?;

 	println!("==== HEALTHY NODES ====");
 	let mut healthy_nodes =
 		vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
 	for adv in status.iter().filter(|adv| adv.is_up) {
-		match layout.roles.get(&adv.id) {
-			Some(NodeRoleV(Some(cfg))) => {
+		let host = adv.status.hostname.as_deref().unwrap_or("?");
+		if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
 			let data_avail = match &adv.status.data_disk_avail {
 				_ if cfg.capacity.is_none() => "N/A".into(),
 				Some((avail, total)) => {
@ -76,23 +70,40 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
 			healthy_nodes.push(format!(
 				"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
 				id = adv.id,
-					host = adv.status.hostname,
+				host = host,
 				addr = adv.addr,
 				tags = cfg.tags.join(","),
 				zone = cfg.zone,
 				capacity = cfg.capacity_string(),
 				data_avail = data_avail,
 			));
-			}
-			_ => {
-				let new_role = match layout.staging_roles.get(&adv.id) {
-					Some(NodeRoleV(Some(_))) => "(pending)",
+		} else {
+			let prev_role = layout
+				.versions
+				.iter()
+				.rev()
+				.find_map(|x| match x.roles.get(&adv.id) {
+					Some(NodeRoleV(Some(cfg))) => Some(cfg),
+					_ => None,
+				});
+			if let Some(cfg) = prev_role {
+				healthy_nodes.push(format!(
+					"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
+					id = adv.id,
+					host = host,
+					addr = adv.addr,
+					tags = cfg.tags.join(","),
+					zone = cfg.zone,
+				));
+			} else {
+				let new_role = match layout.staging.get().roles.get(&adv.id) {
+					Some(NodeRoleV(Some(_))) => "pending...",
 					_ => "NO ROLE ASSIGNED",
 				};
 				healthy_nodes.push(format!(
-					"{id:?}\t{h}\t{addr}\t{new_role}",
+					"{id:?}\t{h}\t{addr}\t\t\t{new_role}",
 					id = adv.id,
-					h = adv.status.hostname,
+					h = host,
 					addr = adv.addr,
 					new_role = new_role,
 				));
@ -101,51 +112,76 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
 	}
 	format_table(healthy_nodes);

-	let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>();
-	let failure_case_1 = status
+	// Determine which nodes are unhealthy and print that to stdout
+	let status_map = status
 		.iter()
-		.any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_)))));
-	let failure_case_2 = layout
-		.roles
-		.items()
-		.iter()
-		.any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some());
-	if failure_case_1 || failure_case_2 {
-		println!("\n==== FAILED NODES ====");
+		.map(|adv| (adv.id, adv))
+		.collect::<HashMap<_, _>>();
+
+	let tf = timeago::Formatter::new();
+	let mut drain_msg = false;
 	let mut failed_nodes =
 		vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
-		for adv in status.iter().filter(|adv| !adv.is_up) {
-			if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) {
-				let tf = timeago::Formatter::new();
-				failed_nodes.push(format!(
-					"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
-					id = adv.id,
-					host = adv.status.hostname,
-					addr = adv.addr,
-					tags = cfg.tags.join(","),
-					zone = cfg.zone,
-					capacity = cfg.capacity_string(),
-					last_seen = adv
-						.last_seen_secs_ago
+	let mut listed = HashSet::new();
+	for ver in layout.versions.iter().rev() {
+		for (node, _, role) in ver.roles.items().iter() {
+			let cfg = match role {
+				NodeRoleV(Some(role)) if role.capacity.is_some() => role,
+				_ => continue,
+			};
+
+			if listed.contains(node) {
+				continue;
+			}
+			listed.insert(*node);
+
+			let adv = status_map.get(node);
+			if adv.map(|x| x.is_up).unwrap_or(false) {
+				continue;
+			}
+
+			// Node is in a layout version, is not a gateway node, and is not up:
+			// it is in a failed state, add proper line to the output
+			let (host, addr, last_seen) = match adv {
+				Some(adv) => (
+					adv.status.hostname.as_deref().unwrap_or("?"),
+					adv.addr.to_string(),
+					adv.last_seen_secs_ago
 						.map(|s| tf.convert(Duration::from_secs(s)))
 						.unwrap_or_else(|| "never seen".into()),
-				));
-			}
-		}
-		for (id, _, role_v) in layout.roles.items().iter() {
-			if let NodeRoleV(Some(cfg)) = role_v {
-				if !status_keys.contains(id) {
+				),
+				None => ("??", "??".into(), "never seen".into()),
+			};
+			let capacity = if ver.version == layout.current().version {
+				cfg.capacity_string()
+			} else {
+				drain_msg = true;
+				"draining metadata...".to_string()
+			};
 			failed_nodes.push(format!(
-						"{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen",
-						id = id,
+				"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
+				id = node,
+				host = host,
+				addr = addr,
 				tags = cfg.tags.join(","),
 				zone = cfg.zone,
-						capacity = cfg.capacity_string(),
+				capacity = capacity,
+				last_seen = last_seen,
 			));
 		}
 	}
-		}
+
+	if failed_nodes.len() > 1 {
+		println!("\n==== FAILED NODES ====");
 		format_table(failed_nodes);
+		if drain_msg {
+			println!();
+			println!("Your cluster is expecting to drain data from nodes that are currently unavailable.");
+			println!("If these nodes are definitely dead, please review the layout history with");
+			println!(
+				"`garage layout history` and use `garage layout skip-dead-nodes` to force progress."
+			);
+		}
 	}

 	if print_staging_role_changes(&layout) {
@ -226,3 +262,18 @@ pub async fn cmd_admin(
 	}
 	Ok(())
 }
+
+// ---- utility ----
+
+pub async fn fetch_status(
+	rpc_cli: &Endpoint<SystemRpc, ()>,
+	rpc_host: NodeID,
+) -> Result<Vec<KnownNodeInfo>, Error> {
+	match rpc_cli
+		.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
+		.await??
+	{
+		SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),
+		resp => Err(Error::unexpected_rpc_message(resp)),
+	}
+}
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch(
 		LayoutOperation::Config(config_opt) => {
 			cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await
 		}
+		LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await,
+		LayoutOperation::SkipDeadNodes(assume_sync_opt) => {
+			cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await
+		}
 	}
 }

@ -49,6 +53,7 @@ pub async fn cmd_assign_role(
 	};

 	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+	let all_nodes = layout.get_all_nodes();

 	let added_nodes = args
 		.node_ids
@ -58,21 +63,23 @@ pub async fn cmd_assign_role(
 				status
 					.iter()
 					.map(|adv| adv.id)
-					.chain(layout.node_ids().iter().cloned()),
+					.chain(all_nodes.iter().cloned()),
 				node_id,
 			)
 		})
 		.collect::<Result<Vec<_>, _>>()?;

-	let mut roles = layout.roles.clone();
-	roles.merge(&layout.staging_roles);
+	let mut roles = layout.current().roles.clone();
+	roles.merge(&layout.staging.get().roles);

 	for replaced in args.replace.iter() {
-		let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?;
+		let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?;
 		match roles.get(&replaced_node) {
 			Some(NodeRoleV(Some(_))) => {
 				layout
-					.staging_roles
+					.staging
+					.get_mut()
+					.roles
 					.merge(&roles.update_mutator(replaced_node, NodeRoleV(None)));
 			}
 			_ => {
@ -130,7 +137,9 @@ pub async fn cmd_assign_role(
 		};

 		layout
-			.staging_roles
+			.staging
+			.get_mut()
+			.roles
 			.merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry))));
 	}

@ -149,14 +158,16 @@ pub async fn cmd_remove_role(
 ) -> Result<(), Error> {
 	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;

-	let mut roles = layout.roles.clone();
-	roles.merge(&layout.staging_roles);
+	let mut roles = layout.current().roles.clone();
+	roles.merge(&layout.staging.get().roles);

 	let deleted_node =
 		find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?;

 	layout
-		.staging_roles
+		.staging
+		.get_mut()
+		.roles
 		.merge(&roles.update_mutator(deleted_node, NodeRoleV(None)));

 	send_layout(rpc_cli, rpc_host, layout).await?;
@ -174,13 +185,16 @@ pub async fn cmd_show_layout(
 	let layout = fetch_layout(rpc_cli, rpc_host).await?;

 	println!("==== CURRENT CLUSTER LAYOUT ====");
-	print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
+	print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes.");
 	println!();
-	println!("Current cluster layout version: {}", layout.version);
+	println!(
+		"Current cluster layout version: {}",
+		layout.current().version
+	);

 	let has_role_changes = print_staging_role_changes(&layout);
 	if has_role_changes {
-		let v = layout.version;
+		let v = layout.current().version;
 		let res_apply = layout.apply_staged_changes(Some(v + 1));

 		// this will print the stats of what partitions
@ -189,7 +203,7 @@ pub async fn cmd_show_layout(
 			Ok((layout, msg)) => {
 				println!();
 				println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ====");
-				print_cluster_layout(&layout, "No nodes have a role in the new layout.");
+				print_cluster_layout(layout.current(), "No nodes have a role in the new layout.");
 				println!();

 				for line in msg.iter() {
@ -199,16 +213,12 @@ pub async fn cmd_show_layout(
 				println!();
 				println!("    garage layout apply --version {}", v + 1);
 				println!();
-				println!(
-                    "You can also revert all proposed changes with: garage layout revert --version {}",
-                    v + 1)
+				println!("You can also revert all proposed changes with: garage layout revert");
 			}
 			Err(e) => {
 				println!("Error while trying to compute the assignment: {}", e);
 				println!("This new layout cannot yet be applied.");
-				println!(
-                    "You can also revert all proposed changes with: garage layout revert --version {}",
-                    v + 1)
+				println!("You can also revert all proposed changes with: garage layout revert");
 			}
 		}
 	}
@ -241,9 +251,15 @@ pub async fn cmd_revert_layout(
 	rpc_host: NodeID,
 	revert_opt: RevertLayoutOpt,
 ) -> Result<(), Error> {
+	if !revert_opt.yes {
+		return Err(Error::Message(
+			"Please add the --yes flag to run the layout revert operation".into(),
+		));
+	}
+
 	let layout = fetch_layout(rpc_cli, rpc_host).await?;

-	let layout = layout.revert_staged_changes(revert_opt.version)?;
+	let layout = layout.revert_staged_changes()?;

 	send_layout(rpc_cli, rpc_host, layout).await?;

@ -266,11 +282,11 @@ pub async fn cmd_config_layout(
 				.parse::<ZoneRedundancy>()
 				.ok_or_message("invalid zone redundancy value")?;
 			if let ZoneRedundancy::AtLeast(r_int) = r {
-				if r_int > layout.replication_factor {
+				if r_int > layout.current().replication_factor {
 					return Err(Error::Message(format!(
 						"The zone redundancy must be smaller or equal to the \
                    replication factor ({}).",
-						layout.replication_factor
+						layout.current().replication_factor
 					)));
 				} else if r_int < 1 {
 					return Err(Error::Message(
@ -280,7 +296,9 @@ pub async fn cmd_config_layout(
 			}

 			layout
-				.staging_parameters
+				.staging
+				.get_mut()
+				.parameters
 				.update(LayoutParameters { zone_redundancy: r });
 			println!("The zone redundancy parameter has been set to '{}'.", r);
 			did_something = true;
@ -297,25 +315,166 @@ pub async fn cmd_config_layout(
 	Ok(())
 }

+pub async fn cmd_layout_history(
+	rpc_cli: &Endpoint<SystemRpc, ()>,
+	rpc_host: NodeID,
+) -> Result<(), Error> {
+	let layout = fetch_layout(rpc_cli, rpc_host).await?;
+	let min_stored = layout.min_stored();
+
+	println!("==== LAYOUT HISTORY ====");
+	let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()];
+	for ver in layout
+		.versions
+		.iter()
+		.rev()
+		.chain(layout.old_versions.iter().rev())
+	{
+		let status = if ver.version == layout.current().version {
+			"current"
+		} else if ver.version >= min_stored {
+			"draining"
+		} else {
+			"historical"
+		};
+		table.push(format!(
+			"#{}\t{}\t{}\t{}",
+			ver.version,
+			status,
+			ver.roles
+				.items()
+				.iter()
+				.filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some()))
+				.count(),
+			ver.roles
+				.items()
+				.iter()
+				.filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none()))
+				.count(),
+		));
+	}
+	format_table(table);
+	println!();
+
+	if layout.versions.len() > 1 {
+		println!("==== UPDATE TRACKERS ====");
+		println!("Several layout versions are currently live in the cluster, and data is being migrated.");
+		println!(
+			"This is the internal data that Garage stores to know which nodes have what data."
+		);
+		println!();
+		let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()];
+		let all_nodes = layout.get_all_nodes();
+		for node in all_nodes.iter() {
+			table.push(format!(
+				"{:?}\t#{}\t#{}\t#{}",
+				node,
+				layout.update_trackers.ack_map.get(node, min_stored),
+				layout.update_trackers.sync_map.get(node, min_stored),
+				layout.update_trackers.sync_ack_map.get(node, min_stored),
+			));
+		}
+		table[1..].sort();
+		format_table(table);
+
+		println!();
+		println!(
+			"If some nodes are not catching up to the latest layout version in the update trackers,"
+		);
+		println!("it might be because they are offline or unable to complete a sync successfully.");
+		println!(
+			"You may force progress using `garage layout skip-dead-nodes --version {}`",
+			layout.current().version
+		);
+	} else {
+		println!("Your cluster is currently in a stable state with a single live layout version.");
+		println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,");
+		println!(
+			"so you might want to keep old nodes online until their data directories become empty."
+		);
+	}
+
+	Ok(())
+}
+
+pub async fn cmd_layout_skip_dead_nodes(
+	rpc_cli: &Endpoint<SystemRpc, ()>,
+	rpc_host: NodeID,
+	opt: SkipDeadNodesOpt,
+) -> Result<(), Error> {
+	let status = fetch_status(rpc_cli, rpc_host).await?;
+	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+
+	if layout.versions.len() == 1 {
+		return Err(Error::Message(
+			"This command cannot be called when there is only one live cluster layout version"
+				.into(),
+		));
+	}
+
+	let min_v = layout.min_stored();
+	if opt.version <= min_v || opt.version > layout.current().version {
+		return Err(Error::Message(format!(
+			"Invalid version, you may use the following version numbers: {}",
+			(min_v + 1..=layout.current().version)
+				.map(|x| x.to_string())
+				.collect::<Vec<_>>()
+				.join(" ")
+		)));
+	}
+
+	let all_nodes = layout.get_all_nodes();
+	let mut did_something = false;
+	for node in all_nodes.iter() {
+		if status.iter().any(|x| x.id == *node && x.is_up) {
+			continue;
+		}
+
+		if layout.update_trackers.ack_map.set_max(*node, opt.version) {
+			println!("Increased the ACK tracker for node {:?}", node);
+			did_something = true;
+		}
+
+		if opt.allow_missing_data {
+			if layout.update_trackers.sync_map.set_max(*node, opt.version) {
+				println!("Increased the SYNC tracker for node {:?}", node);
+				did_something = true;
+			}
+		}
+	}
+
+	if did_something {
+		send_layout(rpc_cli, rpc_host, layout).await?;
+		println!("Success.");
+		Ok(())
+	} else if !opt.allow_missing_data {
+		Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into()))
+	} else {
+		Err(Error::Message(
+			"Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(),
+		))
+	}
+}
+
 // --- utility ---

 pub async fn fetch_layout(
 	rpc_cli: &Endpoint<SystemRpc, ()>,
 	rpc_host: NodeID,
-) -> Result<ClusterLayout, Error> {
+) -> Result<LayoutHistory, Error> {
 	match rpc_cli
 		.call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL)
 		.await??
 	{
 		SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
-		resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
+		resp => Err(Error::unexpected_rpc_message(resp)),
 	}
 }

 pub async fn send_layout(
 	rpc_cli: &Endpoint<SystemRpc, ()>,
 	rpc_host: NodeID,
-	layout: ClusterLayout,
+	layout: LayoutHistory,
 ) -> Result<(), Error> {
 	rpc_cli
 		.call(
@ -327,7 +486,7 @@ pub async fn send_layout(
 	Ok(())
 }

-pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
+pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) {
 	let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()];
 	for (id, _, role) in layout.roles.items().iter() {
 		let role = match &role.0 {
@ -366,21 +525,22 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) {
 	}
 }

-pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
-	let has_role_changes = layout
-		.staging_roles
+pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool {
+	let staging = layout.staging.get();
+	let has_role_changes = staging
+		.roles
 		.items()
 		.iter()
-		.any(|(k, _, v)| layout.roles.get(k) != Some(v));
-	let has_layout_changes = *layout.staging_parameters.get() != layout.parameters;
+		.any(|(k, _, v)| layout.current().roles.get(k) != Some(v));
+	let has_layout_changes = *staging.parameters.get() != layout.current().parameters;

 	if has_role_changes || has_layout_changes {
 		println!();
 		println!("==== STAGED ROLE CHANGES ====");
 		if has_role_changes {
 			let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
-			for (id, _, role) in layout.staging_roles.items().iter() {
-				if layout.roles.get(id) == Some(role) {
+			for (id, _, role) in staging.roles.items().iter() {
+				if layout.current().roles.get(id) == Some(role) {
 					continue;
 				}
 				if let Some(role) = &role.0 {
@ -402,7 +562,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
 		if has_layout_changes {
 			println!(
 				"Zone redundancy: {}",
-				layout.staging_parameters.get().zone_redundancy
+				staging.parameters.get().zone_redundancy
 			);
 		}
 		true
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@ -112,6 +112,14 @@ pub enum LayoutOperation {
 	/// Revert staged changes to cluster layout
 	#[structopt(name = "revert", version = garage_version())]
 	Revert(RevertLayoutOpt),
+
+	/// View the history of layouts in the cluster
+	#[structopt(name = "history", version = garage_version())]
+	History,
+
+	/// Skip dead nodes when waiting for a new layout version to be synchronized
+	#[structopt(name = "skip-dead-nodes", version = garage_version())]
+	SkipDeadNodes(SkipDeadNodesOpt),
 }

 #[derive(StructOpt, Debug)]
@ -164,9 +172,21 @@ pub struct ApplyLayoutOpt {

 #[derive(StructOpt, Debug)]
 pub struct RevertLayoutOpt {
-	/// Version number of old configuration to which to revert
+	/// The revert operation will not be run unless this flag is added
+	#[structopt(long = "yes")]
+	pub(crate) yes: bool,
+}
+
+#[derive(StructOpt, Debug)]
+pub struct SkipDeadNodesOpt {
+	/// Version number of the layout to assume is currently up-to-date.
+	/// This will generally be the current layout version.
 	#[structopt(long = "version")]
-	pub(crate) version: Option<u64>,
+	pub(crate) version: u64,
+	/// Allow the skip even if a quorum of nodes could not be found for
+	/// the data among the remaining nodes
+	#[structopt(long = "allow-missing-data")]
+	pub(crate) allow_missing_data: bool,
 }

 #[derive(Serialize, Deserialize, StructOpt, Debug)]
--- a/src/garage/cli/util.rs
+++ b/src/garage/cli/util.rs
@ -450,6 +450,8 @@ pub fn print_block_info(

 	if refcount != nondeleted_count {
 		println!();
-		println!("Warning: refcount does not match number of non-deleted versions");
+		println!(
+			"Warning: refcount does not match number of non-deleted versions (see issue #644)."
+		);
 	}
 }
--- a/src/garage/tests/common/ext/process.rs
+++ b/src/garage/tests/common/ext/process.rs
@ -14,42 +14,20 @@ impl CommandExt for process::Command {
 	}

 	fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus {
-		let status = self.status().expect(msg);
-		status.expect_success(msg);
-		status
+		self.expect_success_output(msg).status
 	}
 	fn expect_success_output(&mut self, msg: &str) -> process::Output {
 		let output = self.output().expect(msg);
-		output.expect_success(msg);
+		if !output.status.success() {
+			panic!(
+				"{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}",
+				msg,
+				self,
+				output.status.code(),
+				String::from_utf8_lossy(&output.stdout),
+				String::from_utf8_lossy(&output.stderr)
+			);
+		}
 		output
 	}
 }
-
-pub trait OutputExt {
-	fn expect_success(&self, msg: &str);
-}
-
-impl OutputExt for process::Output {
-	fn expect_success(&self, msg: &str) {
-		self.status.expect_success(msg)
-	}
-}
-
-pub trait ExitStatusExt {
-	fn expect_success(&self, msg: &str);
-}
-
-impl ExitStatusExt for process::ExitStatus {
-	fn expect_success(&self, msg: &str) {
-		if !self.success() {
-			match self.code() {
-				Some(code) => panic!(
-					"Command exited with code {code}: {msg}",
-					code = code,
-					msg = msg
-				),
-				None => panic!("Command exited with signal: {msg}", msg = msg),
-			}
-		}
-	}
-}
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@ -96,7 +96,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 			.arg("server")
 			.stdout(stdout)
 			.stderr(stderr)
-			.env("RUST_LOG", "garage=info,garage_api=trace")
+			.env("RUST_LOG", "garage=debug,garage_api=trace")
 			.spawn()
 			.expect("Could not start garage");

--- a/src/model/helper/bucket.rs
+++ b/src/model/helper/bucket.rs
@ -450,10 +450,12 @@ impl<'a> BucketHelper<'a> {

 		#[cfg(feature = "k2v")]
 		{
-			use garage_rpc::ring::Ring;
-			use std::sync::Arc;
-
-			let ring: Arc<Ring> = self.0.system.ring.borrow().clone();
+			let node_id_vec = self
+				.0
+				.system
+				.cluster_layout()
+				.all_nongateway_nodes()
+				.to_vec();
 			let k2vindexes = self
 				.0
 				.k2v
@ -462,7 +464,7 @@ impl<'a> BucketHelper<'a> {
 				.get_range(
 					&bucket_id,
 					None,
-					Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
+					Some((DeletedFilter::NotDeleted, node_id_vec)),
 					10,
 					EnumerationOrder::Forward,
 				)
--- a/src/model/index_counter.rs
+++ b/src/model/index_counter.rs
@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};

 use garage_db as db;

-use garage_rpc::ring::Ring;
+use garage_rpc::layout::LayoutHelper;
 use garage_rpc::system::System;
 use garage_util::background::BackgroundRunner;
 use garage_util::data::*;
@ -83,9 +83,9 @@ impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> {
 }

 impl<T: CountedItem> CounterEntry<T> {
-	pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> {
-		let nodes = &ring.layout.node_id_vec[..];
-		self.filtered_values_with_nodes(nodes)
+	pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap<String, i64> {
+		let nodes = layout.all_nongateway_nodes();
+		self.filtered_values_with_nodes(&nodes)
 	}

 	pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {
--- a/src/model/k2v/rpc.rs
+++ b/src/model/k2v/rpc.rs
@ -127,23 +127,21 @@ impl K2VRpcHandler {
 			.item_table
 			.data
 			.replication
-			.write_nodes(&partition.hash());
+			.storage_nodes(&partition.hash());
 		who.sort();

 		self.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
-				&who[..],
+				&who,
 				K2VRpc::InsertItem(InsertedItem {
 					partition,
 					sort_key,
 					causal_context,
 					value,
 				}),
-				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(1)
-					.interrupt_after_quorum(true),
+				RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
 			)
 			.await?;

@ -168,7 +166,7 @@ impl K2VRpcHandler {
 				.item_table
 				.data
 				.replication
-				.write_nodes(&partition.hash());
+				.storage_nodes(&partition.hash());
 			who.sort();

 			call_list.entry(who).or_default().push(InsertedItem {
@ -187,14 +185,12 @@ impl K2VRpcHandler {
 		let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
 			let resp = self
 				.system
-				.rpc
+				.rpc_helper()
 				.try_call_many(
 					&self.endpoint,
 					&nodes[..],
 					K2VRpc::InsertManyItems(items),
-					RequestStrategy::with_priority(PRIO_NORMAL)
-						.with_quorum(1)
-						.interrupt_after_quorum(true),
+					RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1),
 				)
 				.await?;
 			Ok::<_, Error>((nodes, resp))
@ -223,15 +219,16 @@ impl K2VRpcHandler {
 			},
 			sort_key,
 		};
+		// TODO figure this out with write sets, is it still appropriate???
 		let nodes = self
 			.item_table
 			.data
 			.replication
-			.write_nodes(&poll_key.partition.hash());
+			.read_nodes(&poll_key.partition.hash());

-		let rpc = self.system.rpc.try_call_many(
+		let rpc = self.system.rpc_helper().try_call_many(
 			&self.endpoint,
-			&nodes[..],
+			&nodes,
 			K2VRpc::PollItem {
 				key: poll_key,
 				causal_context,
@ -239,9 +236,11 @@ impl K2VRpcHandler {
 			},
 			RequestStrategy::with_priority(PRIO_NORMAL)
 				.with_quorum(self.item_table.data.replication.read_quorum())
+				.send_all_at_once(true)
 				.without_timeout(),
 		);
-		let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
+		let timeout_duration =
+			Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout();
 		let resps = select! {
 			r = rpc => r?,
 			_ = tokio::time::sleep(timeout_duration) => return Ok(None),
@ -283,11 +282,12 @@ impl K2VRpcHandler {
 		seen.restrict(&range);

 		// Prepare PollRange RPC to send to the storage nodes responsible for the parititon
+		// TODO figure this out with write sets, does it still work????
 		let nodes = self
 			.item_table
 			.data
 			.replication
-			.write_nodes(&range.partition.hash());
+			.read_nodes(&range.partition.hash());
 		let quorum = self.item_table.data.replication.read_quorum();
 		let msg = K2VRpc::PollRange {
 			range,
@ -300,7 +300,11 @@ impl K2VRpcHandler {
 		let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout();
 		let mut requests = nodes
 			.iter()
-			.map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs))
+			.map(|node| {
+				self.system
+					.rpc_helper()
+					.call(&self.endpoint, *node, msg.clone(), rs)
+			})
 			.collect::<FuturesUnordered<_>>();

 		// Fetch responses. This procedure stops fetching responses when any of the following
@ -316,8 +320,9 @@ impl K2VRpcHandler {
 		// kind: all items produced by that node until time ts have been returned, so we can
 		// bump the entry in the global vector clock and possibly remove some item-specific
 		// vector clocks)
-		let mut deadline =
-			Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
+		let mut deadline = Instant::now()
+			+ Duration::from_millis(timeout_msec)
+			+ self.system.rpc_helper().rpc_timeout();
 		let mut resps = vec![];
 		let mut errors = vec![];
 		loop {
@ -339,7 +344,7 @@ impl K2VRpcHandler {
 		}
 		if errors.len() > nodes.len() - quorum {
 			let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
-			return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into());
+			return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into());
 		}

 		// Take all returned items into account to produce the response.
--- a/src/rpc/layout/graph_algo.rs
+++ b/src/rpc/layout/graph_algo.rs
@ -114,16 +114,6 @@ impl Graph<FlowEdge> {
 		Ok(result)
 	}

-	/// This function returns the value of the flow incoming to v.
-	pub fn get_inflow(&self, v: Vertex) -> Result<i64, String> {
-		let idv = self.get_vertex_id(&v)?;
-		let mut result = 0;
-		for edge in self.graph[idv].iter() {
-			result += max(0, self.graph[edge.dest][edge.rev].flow);
-		}
-		Ok(result)
-	}
-
 	/// This function returns the value of the flow outgoing from v.
 	pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> {
 		let idv = self.get_vertex_id(&v)?;
--- a/src/rpc/layout/helper.rs
+++ b/src/rpc/layout/helper.rs
@ -0,0 +1,294 @@
+use std::collections::HashMap;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+use super::*;
+use crate::replication_mode::ReplicationMode;
+
+/// Compact summary of a layout history, as built by `LayoutHelper::digest`:
+/// the current version number, the number of active versions, and the
+/// hashes of the update trackers and of the staging data.
+#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
+pub struct RpcLayoutDigest {
+	/// Cluster layout version
+	pub current_version: u64,
+	/// Number of active layout versions
+	pub active_versions: usize,
+	/// Hash of cluster layout update trackers
+	pub trackers_hash: Hash,
+	/// Hash of cluster layout staging data
+	pub staging_hash: Hash,
+}
+
+/// Triple of version numbers returned by `LayoutHelper::sync_digest`.
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+pub struct SyncLayoutDigest {
+	/// Version number of the current (most recent) layout version
+	current: u64,
+	/// Value of the helper's cached `ack_map_min`
+	ack_map_min: u64,
+	/// Version number of the oldest layout version still stored in `versions`
+	min_stored: u64,
+}
+
+/// Wrapper around a `LayoutHistory` that caches values derived from it
+/// (all computed in `LayoutHelper::new`) and carries the per-version ack locks.
+pub struct LayoutHelper {
+	replication_mode: ReplicationMode,
+	// Always `Some` once constructed; it is an `Option` so that `update()`
+	// can move the history out with `take()` when rebuilding the helper.
+	layout: Option<LayoutHistory>,
+
+	// cached values
+	ack_map_min: u64,
+	sync_map_min: u64,
+
+	all_nodes: Vec<Uuid>,
+	all_nongateway_nodes: Vec<Uuid>,
+
+	trackers_hash: Hash,
+	staging_hash: Hash,
+
+	// ack lock: counts in-progress write operations for each
+	// layout version ; we don't increase the ack update tracker
+	// while this lock is nonzero
+	pub(crate) ack_lock: HashMap<u64, AtomicUsize>,
+}
+
+/// Lets a `LayoutHelper` be used wherever a `&LayoutHistory` is expected.
+impl Deref for LayoutHelper {
+	type Target = LayoutHistory;
+
+	fn deref(&self) -> &Self::Target {
+		// Same access as the private `layout()` helper, written inline.
+		self.layout.as_ref().unwrap()
+	}
+}
+
+impl LayoutHelper {
+	/// Builds a helper from a layout history.
+	///
+	/// The history is first cleaned up (old versions pruned, update trackers
+	/// clamped), then every cached value of the helper is computed from it.
+	/// `ack_lock` is carried over from a previous helper: counters that are
+	/// at zero are dropped and an entry for the current version is ensured.
+	pub fn new(
+		replication_mode: ReplicationMode,
+		mut layout: LayoutHistory,
+		mut ack_lock: HashMap<u64, AtomicUsize>,
+	) -> Self {
+		// In the new() function of the helper, we do a bunch of cleanup
+		// and calculations on the layout history to make sure things are
+		// correct and we have rapid access to important values such as
+		// the layout versions to use when reading to ensure consistency.
+
+		if !replication_mode.is_read_after_write_consistent() {
+			// Fast path for when no consistency is required.
+			// In this case we only need to keep the last version of the layout,
+			// we don't care about coordinating stuff in the cluster.
+			layout.keep_current_version_only();
+		}
+
+		layout.cleanup_old_versions();
+
+		let all_nodes = layout.get_all_nodes();
+		let all_nongateway_nodes = layout.get_all_nongateway_nodes();
+
+		layout.clamp_update_trackers(&all_nodes);
+
+		let min_version = layout.min_stored();
+
+		// ack_map_min is the minimum value of ack_map among all nodes
+		// in the cluster (gateway, non-gateway, current and previous layouts).
+		// It is the highest layout version which all of these nodes have
+		// acknowledged, indicating that they are aware of it and are no
+		// longer processing write operations that did not take it into account.
+		let ack_map_min = layout
+			.update_trackers
+			.ack_map
+			.min_among(&all_nodes, min_version);
+
+		// sync_map_min is the minimum value of sync_map among storage nodes
+		// in the cluster (non-gateway nodes only, current and previous layouts).
+		// It is the highest layout version for which we know that all relevant
+		// storage nodes have fulfilled a sync, and therefore it is safe to
+		// use a read quorum within that layout to ensure consistency.
+		// Gateway nodes are excluded here because they hold no relevant data
+		// (they store the bucket and access key tables, but we don't have
+		// consistency on those).
+		// This value is calculated using quorums to allow progress even
+		// if not all nodes have successfully completed a sync.
+		let sync_map_min =
+			layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes);
+
+		let trackers_hash = layout.calculate_trackers_hash();
+		let staging_hash = layout.calculate_staging_hash();
+
+		// Forget lock counters that are back to zero, then make sure the
+		// current layout version has a counter.
+		ack_lock.retain(|_, cnt| *cnt.get_mut() > 0);
+		ack_lock
+			.entry(layout.current().version)
+			.or_insert(AtomicUsize::new(0));
+
+		LayoutHelper {
+			replication_mode,
+			layout: Some(layout),
+			ack_map_min,
+			sync_map_min,
+			all_nodes,
+			all_nongateway_nodes,
+			trackers_hash,
+			staging_hash,
+			ack_lock,
+		}
+	}
+
+	// ------------------ single updating function --------------
+
+	/// Returns the wrapped layout history.
+	/// Panics if `self.layout` is `None`; `new()` always stores `Some`, and
+	/// `update()` only takes the value out while it rebuilds `self`.
+	fn layout(&self) -> &LayoutHistory {
+		self.layout.as_ref().unwrap()
+	}
+
+	/// Applies `f` to the wrapped layout history and returns the boolean
+	/// reported by `f`. When `f` returns `true`, the whole helper is rebuilt
+	/// through `Self::new`, so the cleanup runs again and every cached value
+	/// is recomputed.
+	pub(crate) fn update<F>(&mut self, f: F) -> bool
+	where
+		F: FnOnce(&mut LayoutHistory) -> bool,
+	{
+		let changed = f(self.layout.as_mut().unwrap());
+		if changed {
+			// Move the history and the ack locks out of `self` and hand them
+			// to `new()`; `self` is overwritten with the result right away.
+			*self = Self::new(
+				self.replication_mode,
+				self.layout.take().unwrap(),
+				std::mem::take(&mut self.ack_lock),
+			);
+		}
+		changed
+	}
+
+	// ------------------ read helpers ---------------
+
+	/// Cached result of `LayoutHistory::get_all_nodes`, computed in `new()`.
+	pub fn all_nodes(&self) -> &[Uuid] {
+		&self.all_nodes
+	}
+
+	/// Cached result of `LayoutHistory::get_all_nongateway_nodes`, computed in `new()`.
+	pub fn all_nongateway_nodes(&self) -> &[Uuid] {
+		&self.all_nongateway_nodes
+	}
+
+	/// Cached minimum of the `ack_map` tracker among all nodes, computed in `new()`.
+	pub fn ack_map_min(&self) -> u64 {
+		self.ack_map_min
+	}
+
+	/// Cached result of `LayoutHistory::calculate_sync_map_min_with_quorum`,
+	/// computed in `new()`.
+	pub fn sync_map_min(&self) -> u64 {
+		self.sync_map_min
+	}
+
+	/// Builds a `SyncLayoutDigest` from the current version number, the
+	/// cached `ack_map_min` and the oldest stored version number.
+	pub fn sync_digest(&self) -> SyncLayoutDigest {
+		let history = self.layout();
+		let current = history.current().version;
+		let min_stored = history.min_stored();
+		SyncLayoutDigest {
+			current,
+			ack_map_min: self.ack_map_min,
+			min_stored,
+		}
+	}
+
+	/// Nodes to read `position` from: those of the layout version whose
+	/// number equals the cached `sync_map_min`, or of the last stored
+	/// version when no stored version has that number.
+	pub fn read_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
+		let versions = &self.layout().versions;
+		let wanted = self.sync_map_min;
+		let read_version = match versions.iter().find(|v| v.version == wanted) {
+			Some(v) => v,
+			None => versions.last().unwrap(),
+		};
+		read_version
+			.nodes_of(position, read_version.replication_factor)
+			.collect()
+	}
+
+	/// One node set per stored layout version, in the order of `versions`,
+	/// each holding the nodes of `position` in that version.
+	pub fn storage_sets_of(&self, position: &Hash) -> Vec<Vec<Uuid>> {
+		let versions = &self.layout().versions;
+		let mut sets = Vec::with_capacity(versions.len());
+		for v in versions.iter() {
+			let set: Vec<Uuid> = v.nodes_of(position, v.replication_factor).collect();
+			sets.push(set);
+		}
+		sets
+	}
+
+	/// Sorted, deduplicated union of the nodes of `position` over all
+	/// stored layout versions.
+	pub fn storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> {
+		let mut nodes = self
+			.layout()
+			.versions
+			.iter()
+			.flat_map(|v| v.nodes_of(position, v.replication_factor))
+			.collect::<Vec<Uuid>>();
+		nodes.sort();
+		nodes.dedup();
+		nodes
+	}
+
+	/// Cached result of `LayoutHistory::calculate_trackers_hash`, computed in `new()`.
+	pub fn trackers_hash(&self) -> Hash {
+		self.trackers_hash
+	}
+
+	/// Cached result of `LayoutHistory::calculate_staging_hash`, computed in `new()`.
+	pub fn staging_hash(&self) -> Hash {
+		self.staging_hash
+	}
+
+	/// Builds an `RpcLayoutDigest` from the current version number, the
+	/// number of stored versions and the two cached hashes.
+	pub fn digest(&self) -> RpcLayoutDigest {
+		let history = self.layout();
+		let current_version = history.current().version;
+		let active_versions = history.versions.len();
+		RpcLayoutDigest {
+			current_version,
+			active_versions,
+			trackers_hash: self.trackers_hash,
+			staging_hash: self.staging_hash,
+		}
+	}
+
+	// ------------------ helpers for update tracking ---------------
+
+	/// Brings the three update trackers of `local_node_id` up to date
+	/// (ack, then sync, then sync-ack) and logs the tracker maps at debug level.
+	pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) {
+		// Ensure trackers for this node's values are up-to-date
+
+		// 1. Acknowledge the last layout version which is not currently
+		//    locked by an in-progress write operation
+		self.ack_max_free(local_node_id);
+
+		// 2. Assume the data on this node is sync'ed up at least to
+		//    the first layout version in the history
+		self.sync_first(local_node_id);
+
+		// 3. Acknowledge everyone has synced up to min(self.sync_map)
+		self.sync_ack(local_node_id);
+
+		debug!("ack_map: {:?}", self.update_trackers.ack_map);
+		debug!("sync_map: {:?}", self.update_trackers.sync_map);
+		debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map);
+	}
+
+	/// Raises the `sync_map` entry of `local_node_id` to at least the
+	/// oldest stored layout version.
+	fn sync_first(&mut self, local_node_id: Uuid) {
+		let oldest_stored = self.layout().min_stored();
+		self.update(move |history| {
+			let trackers = &mut history.update_trackers;
+			trackers.sync_map.set_max(local_node_id, oldest_stored)
+		});
+	}
+
+	/// Raises the `sync_ack_map` entry of `local_node_id` to at least the
+	/// cached `sync_map_min`.
+	fn sync_ack(&mut self, local_node_id: Uuid) {
+		let synced_up_to = self.sync_map_min;
+		self.update(move |history| {
+			let trackers = &mut history.update_trackers;
+			trackers.sync_ack_map.set_max(local_node_id, synced_up_to)
+		});
+	}
+
+	/// Raises the `ack_map` entry of `local_node_id` to `max_free_ack()`.
+	/// Returns whether the tracker changed; a change is logged at info level.
+	pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool {
+		let ack_target = self.max_free_ack();
+		let updated = self.update(move |history| {
+			let trackers = &mut history.update_trackers;
+			trackers.ack_map.set_max(local_node_id, ack_target)
+		});
+		if !updated {
+			return false;
+		}
+		info!("ack_until updated to {}", ack_target);
+		true
+	}
+
+	/// Returns the version number that can currently be acknowledged:
+	/// the oldest stored layout version whose `ack_lock` counter is
+	/// non-zero, or the current version when no stored version has a
+	/// non-zero counter (versions without a counter count as unlocked).
+	pub(crate) fn max_free_ack(&self) -> u64 {
+		self.layout()
+			.versions
+			.iter()
+			.map(|x| x.version)
+			// `find` replaces the former `skip_while(..).next()` pair; the
+			// predicate is the negation of the old skip condition.
+			.find(|v| {
+				self.ack_lock
+					.get(v)
+					.map(|x| x.load(Ordering::Relaxed) != 0)
+					.unwrap_or(false)
+			})
+			// Only look up the current version when it is actually needed.
+			.unwrap_or_else(|| self.current().version)
+	}
+}
--- a/src/rpc/layout/history.rs
+++ b/src/rpc/layout/history.rs
@ -0,0 +1,306 @@
+use std::collections::HashSet;
+
+use garage_util::crdt::{Crdt, Lww, LwwMap};
+use garage_util::data::*;
+use garage_util::encode::nonversioned_encode;
+use garage_util::error::*;
+
+use super::*;
+use crate::replication_mode::ReplicationMode;
+
+impl LayoutHistory {
+	/// Creates a history holding a single, freshly created layout version,
+	/// no old versions, default update trackers, and a staging area that
+	/// starts from that version's parameters with no staged roles.
+	pub fn new(replication_factor: usize) -> Self {
+		let initial_version = LayoutVersion::new(replication_factor);
+		let staged_parameters = Lww::<LayoutParameters>::new(initial_version.parameters);
+
+		LayoutHistory {
+			versions: vec![initial_version],
+			old_versions: Vec::new(),
+			update_trackers: Default::default(),
+			staging: Lww::raw(
+				0,
+				LayoutStaging {
+					parameters: staged_parameters,
+					roles: LwwMap::new(),
+				},
+			),
+		}
+	}
+
+	// ------------------ who stores what now? ---------------
+
+	/// Most recent layout version (last element of `versions`).
+	/// Panics if `versions` is empty.
+	pub fn current(&self) -> &LayoutVersion {
+		let newest = self.versions.last();
+		newest.unwrap()
+	}
+
+	/// Version number of the oldest layout version kept in `versions`.
+	/// Panics if `versions` is empty.
+	pub fn min_stored(&self) -> u64 {
+		let oldest = self.versions.first().unwrap();
+		oldest.version
+	}
+
+	/// All nodes appearing in any stored layout version, without
+	/// duplicates. With several versions the order of the result is the
+	/// iteration order of a `HashSet`, i.e. unspecified.
+	pub fn get_all_nodes(&self) -> Vec<Uuid> {
+		match self.versions.as_slice() {
+			// Single version: its node list is copied as-is.
+			[only] => only.all_nodes().to_vec(),
+			// Otherwise deduplicate across versions through a set.
+			versions => {
+				let mut unique = HashSet::<&Uuid>::new();
+				for v in versions {
+					unique.extend(v.all_nodes());
+				}
+				unique.into_iter().copied().collect::<Vec<_>>()
+			}
+		}
+	}
+
+	/// All non-gateway nodes appearing in any stored layout version,
+	/// without duplicates. With several versions the order of the result
+	/// is the iteration order of a `HashSet`, i.e. unspecified.
+	pub(crate) fn get_all_nongateway_nodes(&self) -> Vec<Uuid> {
+		match self.versions.as_slice() {
+			// Single version: its node list is copied as-is.
+			[only] => only.nongateway_nodes().to_vec(),
+			// Otherwise deduplicate across versions through a set.
+			versions => {
+				let mut unique = HashSet::<&Uuid>::new();
+				for v in versions {
+					unique.extend(v.nongateway_nodes());
+				}
+				unique.into_iter().copied().collect::<Vec<_>>()
+			}
+		}
+	}
+
+	// ---- housekeeping (all invoked by LayoutHelper) ----
+
+	/// Moves every version except the last one from `versions` to the end
+	/// of `old_versions`, oldest first.
+	pub(crate) fn keep_current_version_only(&mut self) {
+		let n_outdated = self.versions.len().saturating_sub(1);
+		self.old_versions
+			.extend(self.versions.drain(..n_outdated));
+	}
+
+	/// Prunes `versions` and `old_versions` in three steps:
+	/// drops leading versions whose `check()` fails (only when the current
+	/// version passes its check), moves versions older than the minimum
+	/// `sync_ack_map` value among the current nodes to `old_versions`, and
+	/// finally trims `old_versions` down to `OLD_VERSION_COUNT` entries.
+	pub(crate) fn cleanup_old_versions(&mut self) {
+		// If there are invalid versions before valid versions, remove them
+		if self.versions.len() > 1 && self.current().check().is_ok() {
+			while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() {
+				let removed = self.versions.remove(0);
+				info!(
+					"Layout history: pruning old invalid version {}",
+					removed.version
+				);
+			}
+		}
+
+		// If there are old versions that no one is reading from anymore,
+		// remove them (keep them in self.old_versions).
+		// ASSUMPTION: we only care about where nodes in the current layout version
+		// are reading from, as we assume older nodes are being discarded.
+		let current_nodes = &self.current().node_id_vec;
+		let min_version = self.min_stored();
+		let sync_ack_map_min = self
+			.update_trackers
+			.sync_ack_map
+			.min_among(current_nodes, min_version);
+		while self.min_stored() < sync_ack_map_min {
+			// The current version must never be moved out of `versions`.
+			assert!(self.versions.len() > 1);
+			let removed = self.versions.remove(0);
+			info!(
+				"Layout history: moving version {} to old_versions",
+				removed.version
+			);
+			self.old_versions.push(removed);
+		}
+
+		// Bound the number of archived versions, dropping the oldest first.
+		while self.old_versions.len() > OLD_VERSION_COUNT {
+			let removed = self.old_versions.remove(0);
+			info!("Layout history: removing old_version {}", removed.version);
+		}
+	}
+
+	/// Raises the `ack_map`, `sync_map` and `sync_ack_map` entries of each
+	/// node in `nodes` to at least the oldest stored layout version.
+	pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) {
+		let floor = self.min_stored();
+		let trackers = &mut self.update_trackers;
+		for &node in nodes.iter() {
+			trackers.ack_map.set_max(node, floor);
+			trackers.sync_map.set_max(node, floor);
+			trackers.sync_ack_map.set_max(node, floor);
+		}
+	}
+
+	/// Computes the layout version number to read from, based on the
+	/// `sync_map` tracker and the write quorum of `replication_mode`.
+	/// The result lies between the minimum `sync_map` value among
+	/// `all_nongateway_nodes` and the current version number.
+	pub(crate) fn calculate_sync_map_min_with_quorum(
+		&self,
+		replication_mode: ReplicationMode,
+		all_nongateway_nodes: &[Uuid],
+	) -> u64 {
+		// This function calculates the minimum layout version from which
+		// it is safe to read if we want to maintain read-after-write consistency.
+		// In the general case the computation can be a bit expensive so
+		// we try to optimize it in several ways.
+
+		// If there is only one layout version, we know that's the one
+		// we need to read from.
+		if self.versions.len() == 1 {
+			return self.current().version;
+		}
+
+		let quorum = replication_mode.write_quorum();
+
+		let min_version = self.min_stored();
+		let global_min = self
+			.update_trackers
+			.sync_map
+			.min_among(all_nongateway_nodes, min_version);
+
+		// If the write quorums are equal to the total number of nodes,
+		// i.e. no writes can succeed while they are not written to all nodes,
+		// then we must in all cases wait for all nodes to complete a sync.
+		// This is represented by reading from the layout with version
+		// number global_min, the smallest layout version for which all nodes
+		// have completed a sync.
+		if quorum == self.current().replication_factor {
+			return global_min;
+		}
+
+		// In the general case, we need to look at all write sets for all partitions,
+		// and find a safe layout version to read for that partition. We then
+		// take the minimum value among all partitions as the safe layout version
+		// to read in all cases (the layout version to which all reads are directed).
+		let mut current_min = self.current().version;
+		let mut sets_done = HashSet::<Vec<Uuid>>::new();
+
+		for (_, p_hash) in self.current().partitions() {
+			for v in self.versions.iter() {
+				if v.version == self.current().version {
+					// We don't care about whether nodes in the latest layout version
+					// have completed a sync or not, as the sync is push-only
+					// and by definition nodes in the latest layout version do not
+					// hold data that must be pushed to nodes in the latest layout
+					// version, since that's the same version (any data that's
+					// already in the latest version is assumed to have been written
+					// by an operation that ensured a quorum of writes within
+					// that version).
+					continue;
+				}
+
+				// Determine set of nodes for partition p in layout version v.
+				// Sort the node set to avoid duplicate computations.
+				let mut set = v
+					.nodes_of(&p_hash, v.replication_factor)
+					.collect::<Vec<Uuid>>();
+				set.sort();
+
+				// If this set was already processed, skip it.
+				if sets_done.contains(&set) {
+					continue;
+				}
+
+				// Find the value of the sync update trackers that is the
+				// highest possible minimum within a quorum of nodes.
+				let mut sync_values = set
+					.iter()
+					.map(|x| self.update_trackers.sync_map.get(x, min_version))
+					.collect::<Vec<_>>();
+				sync_values.sort();
+				// After the ascending sort, the element at `len - quorum` is the
+				// lowest of the `quorum` highest values.
+				// NOTE(review): this indexing assumes 1 <= quorum <= set.len() — confirm.
+				let set_min = sync_values[sync_values.len() - quorum];
+				if set_min < current_min {
+					current_min = set_min;
+				}
+				// unfavorable case, we know we are at the smallest possible version,
+				// so we can stop early
+				assert!(current_min >= global_min);
+				if current_min == global_min {
+					return current_min;
+				}
+
+				// Add set to already processed sets
+				sets_done.insert(set);
+			}
+		}
+
+		current_min
+	}
+
+	/// `blake2sum` of the non-versioned encoding of `update_trackers`.
+	/// Panics if the encoding fails.
+	pub(crate) fn calculate_trackers_hash(&self) -> Hash {
+		let encoded = nonversioned_encode(&self.update_trackers).unwrap();
+		blake2sum(&encoded[..])
+	}
+
+	/// `blake2sum` of the non-versioned encoding of `staging`.
+	/// Panics if the encoding fails.
+	pub(crate) fn calculate_staging_hash(&self) -> Hash {
+		let encoded = nonversioned_encode(&self.staging).unwrap();
+		blake2sum(&encoded[..])
+	}
+
+	// ================== updates to layout, public interface ===================
+
+	/// Merges `other` into this history and returns whether anything changed.
+	///
+	/// - A version of `other` that is not stored here is appended only if
+	///   its direct predecessor is stored here; otherwise an error is logged
+	///   and the version is ignored.
+	/// - A version already stored here is compared with the received one and
+	///   an error is logged if they differ.
+	/// - Update trackers and staged changes are merged through their own
+	///   `merge` implementations.
+	pub fn merge(&mut self, other: &LayoutHistory) -> bool {
+		let mut changed = false;
+
+		// Add any new versions to history
+		for v2 in other.versions.iter() {
+			if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) {
+				// Version is already present, check consistency
+				if v1 != v2 {
+					error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version);
+				}
+				continue;
+			}
+
+			// `checked_sub` instead of `v2.version - 1`: version 0 has no
+			// predecessor, and a plain subtraction would panic in debug builds
+			// (and wrap around to u64::MAX in release builds) when a peer
+			// sends a version 0 that is no longer stored here.
+			match v2.version.checked_sub(1) {
+				Some(prev) if self.versions.iter().any(|v| v.version == prev) => {
+					self.versions.push(v2.clone());
+					changed = true;
+				}
+				Some(prev) => {
+					error!(
+						"Cannot receive new layout version {}, version {} is missing",
+						v2.version, prev
+					);
+				}
+				None => {
+					error!(
+						"Cannot receive layout version {}: it has no predecessor and is not part of the local history",
+						v2.version
+					);
+				}
+			}
+		}
+
+		// Merge trackers
+		let c = self.update_trackers.merge(&other.update_trackers);
+		changed = changed || c;
+
+		// Merge staged layout changes
+		if self.staging != other.staging {
+			let prev_staging = self.staging.clone();
+			self.staging.merge(&other.staging);
+			changed = changed || self.staging != prev_staging;
+		}
+
+		changed
+	}
+
+	/// Turns the staged changes into a new layout version.
+	///
+	/// `version` must be `Some(current version + 1)`; otherwise an
+	/// `Error::Message` is returned. On success the new version is appended
+	/// to the history, old versions are cleaned up, the staged roles are
+	/// reset (the staged parameters are kept), and the updated history is
+	/// returned along with the message from `calculate_next_version`.
+	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> {
+		match version {
+			Some(v) if v != self.current().version + 1 => {
+				return Err(Error::Message("Invalid new layout version".into()));
+			}
+			Some(_) => (),
+			None => {
+				let error = r#"
+Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
+				"#;
+				return Err(Error::Message(error.into()));
+			}
+		}
+
+		// Compute new version and add it to history
+		let base_version = self.current().clone();
+		let (new_version, msg) = base_version.calculate_next_version(self.staging.get())?;
+
+		self.versions.push(new_version);
+		self.cleanup_old_versions();
+
+		// Reset the staged layout changes
+		let kept_parameters = self.staging.get().parameters.clone();
+		self.staging.update(LayoutStaging {
+			parameters: kept_parameters,
+			roles: LwwMap::new(),
+		});
+
+		Ok((self, msg))
+	}
+
+	/// Discards the staged changes: staging is replaced by the parameters
+	/// of the current layout version and an empty role map.
+	/// As written, this always returns `Ok`.
+	pub fn revert_staged_changes(mut self) -> Result<Self, Error> {
+		let current_parameters = self.current().parameters;
+		let clean_staging = LayoutStaging {
+			parameters: Lww::new(current_parameters),
+			roles: LwwMap::new(),
+		};
+		self.staging.update(clean_staging);
+
+		Ok(self)
+	}
+
+	/// Validates the history; for now this only runs `check()` on the
+	/// current layout version and returns its result.
+	pub fn check(&self) -> Result<(), String> {
+		// TODO: anything more ?
+		self.current().check()
+	}
+}
--- a/src/rpc/layout/manager.rs
+++ b/src/rpc/layout/manager.rs
@ -0,0 +1,378 @@
+use std::collections::HashMap;
+use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard};
+use std::time::Duration;
+
+use tokio::sync::Notify;
+
+use netapp::endpoint::Endpoint;
+use netapp::peering::fullmesh::FullMeshPeeringStrategy;
+use netapp::NodeID;
+
+use garage_util::config::Config;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::persister::Persister;
+
+use super::*;
+use crate::replication_mode::ReplicationMode;
+use crate::rpc_helper::*;
+use crate::system::*;
+
+/// Holds the cluster layout of the local node, persists it to disk and
+/// exchanges it with other nodes through the system RPC endpoint.
+pub struct LayoutManager {
+	/// Id of the local node, used as key when updating the update trackers
+	node_id: Uuid,
+	/// Replication mode from the configuration; its replication factor is
+	/// compared against the one of layouts loaded from disk or received from peers
+	replication_mode: ReplicationMode,
+	/// Persister for the `cluster_layout` file in the metadata directory
+	persist_cluster_layout: Persister<LayoutHistory>,
+
+	/// The current layout; the same `Arc` is also handed to `rpc_helper`
+	layout: Arc<RwLock<LayoutHelper>>,
+	/// Waiters are notified each time a merge changed the layout or its trackers
+	pub(crate) change_notify: Arc<Notify>,
+
+	/// For each registered table name, the layout version reported through
+	/// `sync_table_until` (initially the first version present when the table was added)
+	table_sync_version: Mutex<HashMap<String, u64>>,
+
+	pub(crate) rpc_helper: RpcHelper,
+	/// Endpoint used for pulling and broadcasting layouts and trackers
+	system_endpoint: Arc<Endpoint<SystemRpc, System>>,
+}
+
+impl LayoutManager {
+	/// Creates the layout manager of this node.
+	///
+	/// The layout history is loaded from the `cluster_layout` file of the metadata
+	/// directory. If it cannot be loaded, a fresh history is created for the
+	/// configured replication factor. If it loads but its current version has a
+	/// different replication factor than the configuration, an error is returned.
+	///
+	/// The `RwLock` holding the layout is shared with the `RpcHelper` built here.
+	pub fn new(
+		config: &Config,
+		node_id: NodeID,
+		system_endpoint: Arc<Endpoint<SystemRpc, System>>,
+		fullmesh: Arc<FullMeshPeeringStrategy>,
+		replication_mode: ReplicationMode,
+	) -> Result<Arc<Self>, Error> {
+		let replication_factor = replication_mode.replication_factor();
+
+		let persist_cluster_layout: Persister<LayoutHistory> =
+			Persister::new(&config.metadata_dir, "cluster_layout");
+
+		let cluster_layout = match persist_cluster_layout.load() {
+			Ok(x) => {
+				// A stored layout is only usable if it matches the configured replication factor
+				if x.current().replication_factor != replication_factor {
+					return Err(Error::Message(format!(
+						"Previous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
+						x.current().replication_factor,
+						replication_factor
+					)));
+				}
+				x
+			}
+			Err(e) => {
+				info!(
+					"No valid previous cluster layout stored ({}), starting fresh.",
+					e
+				);
+				LayoutHistory::new(replication_factor)
+			}
+		};
+
+		let mut cluster_layout =
+			LayoutHelper::new(replication_mode, cluster_layout, Default::default());
+		cluster_layout.update_trackers(node_id.into());
+
+		let layout = Arc::new(RwLock::new(cluster_layout));
+		let change_notify = Arc::new(Notify::new());
+
+		let rpc_helper = RpcHelper::new(
+			node_id.into(),
+			fullmesh,
+			layout.clone(),
+			config.rpc_timeout_msec.map(Duration::from_millis),
+		);
+
+		Ok(Arc::new(Self {
+			node_id: node_id.into(),
+			replication_mode,
+			persist_cluster_layout,
+			layout,
+			change_notify,
+			table_sync_version: Mutex::new(HashMap::new()),
+			system_endpoint,
+			rpc_helper,
+		}))
+	}
+
+	// ---- PUBLIC INTERFACE ----
+
+	/// Acquires a read lock on the current layout.
+	///
+	/// Panics if the lock is poisoned.
+	pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
+		self.layout.read().unwrap()
+	}
+
+	/// Merges `layout` into the local layout, going through the same code path
+	/// as a layout advertised by a peer; the RPC reply value is discarded.
+	pub async fn update_cluster_layout(
+		self: &Arc<Self>,
+		layout: &LayoutHistory,
+	) -> Result<(), Error> {
+		self.handle_advertise_cluster_layout(layout)
+			.await
+			.map(|_reply| ())
+	}
+
+	/// Registers a table in the sync tracking map, starting at the version
+	/// number of the first entry of the layout's `versions` list.
+	pub fn add_table(&self, table_name: &'static str) {
+		let layout = self.layout();
+		let initial_version = layout.versions.first().unwrap().version;
+		// Release the layout read lock before taking the mutex
+		drop(layout);
+
+		let mut sync_versions = self.table_sync_version.lock().unwrap();
+		sync_versions.insert(table_name.to_string(), initial_version);
+	}
+
+	/// Records that `table_name` is synced up to `version`, then raises this
+	/// node's entry in the sync tracker to the minimum over all registered tables.
+	/// If the tracker changed, the new trackers are broadcast.
+	///
+	/// Panics if the table was not registered with `add_table`.
+	pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) {
+		let mut per_table = self.table_sync_version.lock().unwrap();
+		*per_table.get_mut(table_name).unwrap() = version;
+		// The node as a whole is only synced up to its slowest table
+		let sync_until = per_table.values().copied().min().unwrap();
+		drop(per_table);
+
+		let mut layout = self.layout.write().unwrap();
+		let advanced =
+			layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until));
+		if !advanced {
+			return;
+		}
+
+		info!("sync_until updated to {}", sync_until);
+		self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
+			layout.update_trackers.clone(),
+		));
+	}
+
+	/// Calls `ack_max_free` for the local node under the layout write lock and,
+	/// if it reports a change, broadcasts the updated trackers.
+	///
+	/// Called from `WriteLock::drop`, after its read lock has been released.
+	fn ack_new_version(self: &Arc<Self>) {
+		let mut layout = self.layout.write().unwrap();
+		if layout.ack_max_free(self.node_id) {
+			self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
+				layout.update_trackers.clone(),
+			));
+		}
+	}
+
+	// ---- ACK LOCKING ----
+
+	/// Returns the storage sets of `position`, wrapped in a `WriteLock` tied to
+	/// the current layout version.
+	///
+	/// The ack lock counter of that version is incremented here; dropping the
+	/// returned `WriteLock` decrements it again.
+	///
+	/// Panics if `ack_lock` has no counter for the current version.
+	pub fn write_sets_of(self: &Arc<Self>, position: &Hash) -> WriteLock<Vec<Vec<Uuid>>> {
+		let layout = self.layout();
+		let version = layout.current().version;
+		let nodes = layout.storage_sets_of(position);
+		// Counter is bumped while the read lock is still held
+		layout
+			.ack_lock
+			.get(&version)
+			.unwrap()
+			.fetch_add(1, Ordering::Relaxed);
+		WriteLock::new(version, self, nodes)
+	}
+
+	// ---- INTERNALS ---
+
+	/// Merges an advertised layout history into the local one.
+	///
+	/// Returns a copy of the merged history if the merge changed something,
+	/// `None` otherwise. An advertised layout that fails `check()` is ignored
+	/// as long as the local layout passes it.
+	///
+	/// Panics if merging into a valid local layout yields an invalid one.
+	fn merge_layout(&self, adv: &LayoutHistory) -> Option<LayoutHistory> {
+		let mut layout = self.layout.write().unwrap();
+		let digest_before = layout.digest();
+		let was_valid = layout.check().is_ok();
+
+		// Never pollute a valid local layout with an invalid advertised one
+		if was_valid && adv.check().is_err() {
+			return None;
+		}
+
+		// The advertised layout brought nothing new
+		if !layout.update(|l| l.merge(adv)) {
+			return None;
+		}
+
+		layout.update_trackers(self.node_id);
+		if was_valid && layout.check().is_err() {
+			panic!("Merged two correct layouts and got an incorrect layout.");
+		}
+		assert!(layout.digest() != digest_before);
+		Some(layout.clone())
+	}
+
+	/// Merges advertised update trackers into the local ones.
+	///
+	/// Returns a copy of the merged trackers if the merge changed something,
+	/// `None` otherwise.
+	fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option<UpdateTrackers> {
+		let mut layout = self.layout.write().unwrap();
+		let digest_before = layout.digest();
+
+		// Identical trackers: nothing to merge
+		if layout.update_trackers == *adv {
+			return None;
+		}
+
+		// Different, but nothing in `adv` is ahead of what we already have
+		if !layout.update(|l| l.update_trackers.merge(adv)) {
+			return None;
+		}
+
+		layout.update_trackers(self.node_id);
+		assert!(layout.digest() != digest_before);
+		Some(layout.update_trackers.clone())
+	}
+
+	/// Asks `peer` for its full layout history and merges the answer.
+	///
+	/// RPC errors and unexpected reply types are ignored; a failure while
+	/// handling the received layout is only logged.
+	async fn pull_cluster_layout(self: &Arc<Self>, peer: Uuid) {
+		let reply = self
+			.rpc_helper
+			.call(
+				&self.system_endpoint,
+				peer,
+				SystemRpc::PullClusterLayout,
+				RequestStrategy::with_priority(PRIO_HIGH),
+			)
+			.await;
+
+		let layout = match reply {
+			Ok(SystemRpc::AdvertiseClusterLayout(layout)) => layout,
+			_ => return,
+		};
+
+		if let Err(e) = self.handle_advertise_cluster_layout(&layout).await {
+			warn!("In pull_cluster_layout: {}", e);
+		}
+	}
+
+	/// Asks `peer` for its update trackers and merges the answer.
+	///
+	/// RPC errors and unexpected reply types are ignored; a failure while
+	/// handling the received trackers is only logged.
+	async fn pull_cluster_layout_trackers(self: &Arc<Self>, peer: Uuid) {
+		let reply = self
+			.rpc_helper
+			.call(
+				&self.system_endpoint,
+				peer,
+				SystemRpc::PullClusterLayoutTrackers,
+				RequestStrategy::with_priority(PRIO_HIGH),
+			)
+			.await;
+
+		let trackers = match reply {
+			Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) => trackers,
+			_ => return,
+		};
+
+		let outcome = self
+			.handle_advertise_cluster_layout_trackers(&trackers)
+			.await;
+		if let Err(e) = outcome {
+			warn!("In pull_cluster_layout_trackers: {}", e);
+		}
+	}
+
+	/// Save cluster layout data to disk
+	///
+	/// The layout is cloned under the read lock, which is released before the
+	/// asynchronous save starts.
+	///
+	/// NOTE(review): a failed save panics through `expect`, so this function
+	/// never actually returns `Err` despite its signature. Confirm this is intended.
+	async fn save_cluster_layout(&self) -> Result<(), Error> {
+		let layout = self.layout.read().unwrap().clone();
+		self.persist_cluster_layout
+			.save_async(&layout)
+			.await
+			.expect("Cannot save current cluster layout");
+		Ok(())
+	}
+
+	/// Broadcasts `rpc` through the system endpoint from a spawned task, so the
+	/// caller does not wait for it. A broadcast error is only logged.
+	fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) {
+		let this = self.clone();
+		tokio::spawn(async move {
+			let outcome = this
+				.rpc_helper
+				.broadcast(
+					&this.system_endpoint,
+					rpc,
+					RequestStrategy::with_priority(PRIO_HIGH),
+				)
+				.await;
+			if let Err(e) = outcome {
+				warn!("Error while broadcasting new cluster layout: {}", e);
+			}
+		});
+	}
+
+	// ---- RPC HANDLERS ----
+
+	/// Compares the layout digest advertised by `from` with the local one and
+	/// spawns a task to pull whatever seems to be missing: the full layout if
+	/// the remote has a higher current version or differs in active versions
+	/// or staging, otherwise only the trackers if their hashes differ.
+	pub(crate) fn handle_advertise_status(self: &Arc<Self>, from: Uuid, remote: &RpcLayoutDigest) {
+		let local = self.layout().digest();
+
+		let layout_differs = remote.current_version > local.current_version
+			|| remote.active_versions != local.active_versions
+			|| remote.staging_hash != local.staging_hash;
+
+		if layout_differs {
+			let this = self.clone();
+			tokio::spawn(async move { this.pull_cluster_layout(from).await });
+		} else if remote.trackers_hash != local.trackers_hash {
+			let this = self.clone();
+			tokio::spawn(async move { this.pull_cluster_layout_trackers(from).await });
+		}
+	}
+
+	/// Answers a pull request with a copy of the local layout history.
+	pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc {
+		let layout = self.layout.read().unwrap().clone();
+		SystemRpc::AdvertiseClusterLayout(layout)
+	}
+
+	/// Answers a pull request with a copy of the local update trackers.
+	pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc {
+		let layout = self.layout.read().unwrap();
+		SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone())
+	}
+
+	/// Handles a layout history advertised by another node (or submitted
+	/// locally through `update_cluster_layout`).
+	///
+	/// A layout whose current version has a replication factor different from
+	/// the configured one is rejected with an error. Otherwise it is merged;
+	/// if the merge changed the local layout, waiters on `change_notify` are
+	/// woken up, the merged layout is broadcast and then saved to disk.
+	pub(crate) async fn handle_advertise_cluster_layout(
+		self: &Arc<Self>,
+		adv: &LayoutHistory,
+	) -> Result<SystemRpc, Error> {
+		debug!(
+			"handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}",
+			adv.versions.len(),
+			adv.current().version,
+			adv.update_trackers
+		);
+
+		if adv.current().replication_factor != self.replication_mode.replication_factor() {
+			let msg = format!(
+				"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
+				adv.current().replication_factor,
+				self.replication_mode.replication_factor()
+			);
+			error!("{}", msg);
+			return Err(Error::Message(msg));
+		}
+
+		// `merge_layout` returns Some only when the local layout actually changed
+		if let Some(new_layout) = self.merge_layout(adv) {
+			debug!("handle_advertise_cluster_layout: some changes were added to the current stuff");
+
+			self.change_notify.notify_waiters();
+			self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout));
+			self.save_cluster_layout().await?;
+		}
+
+		Ok(SystemRpc::Ok)
+	}
+
+	/// Handles update trackers advertised by another node.
+	///
+	/// If merging them changed the local trackers, waiters on `change_notify`
+	/// are woken up, the merged trackers are broadcast and the layout is saved
+	/// to disk.
+	pub(crate) async fn handle_advertise_cluster_layout_trackers(
+		self: &Arc<Self>,
+		trackers: &UpdateTrackers,
+	) -> Result<SystemRpc, Error> {
+		debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers);
+
+		// `merge_layout_trackers` returns Some only when something changed locally
+		if let Some(new_trackers) = self.merge_layout_trackers(trackers) {
+			self.change_notify.notify_waiters();
+			self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers));
+			self.save_cluster_layout().await?;
+		}
+
+		Ok(SystemRpc::Ok)
+	}
+}
+
+// ---- ack lock ----
+
+/// A value paired with the layout version that was current when it was
+/// obtained through `LayoutManager::write_sets_of`.
+///
+/// While it is alive, the ack lock counter of that layout version stays
+/// incremented; dropping it decrements the counter.
+pub struct WriteLock<T> {
+	/// Layout version whose ack lock counter was incremented for this lock
+	layout_version: u64,
+	/// Manager used on drop to reach the counter and to ack newer versions
+	layout_manager: Arc<LayoutManager>,
+	/// The wrapped value, exposed through `AsRef` / `AsMut`
+	value: T,
+}
+
+impl<T> WriteLock<T> {
+	/// Wraps `value`. This does not touch the counter: the caller is expected
+	/// to have incremented it already.
+	fn new(version: u64, layout_manager: &Arc<LayoutManager>, value: T) -> Self {
+		Self {
+			layout_version: version,
+			layout_manager: layout_manager.clone(),
+			value,
+		}
+	}
+}
+
+impl<T> AsRef<T> for WriteLock<T> {
+	fn as_ref(&self) -> &T {
+		&self.value
+	}
+}
+
+impl<T> AsMut<T> for WriteLock<T> {
+	fn as_mut(&mut self) -> &mut T {
+		&mut self.value
+	}
+}
+
+impl<T> Drop for WriteLock<T> {
+	/// Decrements the ack lock counter of this lock's layout version. If this
+	/// was the last lock on that version and a newer version is now current,
+	/// asks the manager to ack the new version.
+	fn drop(&mut self) {
+		let layout = self.layout_manager.layout(); // acquire read lock
+		if let Some(counter) = layout.ack_lock.get(&self.layout_version) {
+			// `fetch_sub` returns the value before the decrement: 1 means we were the last holder
+			let prev_lock = counter.fetch_sub(1, Ordering::Relaxed);
+			if prev_lock == 1 && layout.current().version > self.layout_version {
+				drop(layout); // release read lock, write lock will be acquired
+				self.layout_manager.ack_new_version();
+			}
+		} else {
+			error!("Could not find ack lock counter for layout version {}. This probably indicates a bug in Garage.", self.layout_version);
+		}
+	}
+}
--- a/src/rpc/layout/mod.rs
+++ b/src/rpc/layout/mod.rs
@ -0,0 +1,478 @@
+use std::fmt;
+
+use bytesize::ByteSize;
+
+use garage_util::crdt::{AutoCrdt, Crdt};
+use garage_util::data::Uuid;
+
+mod graph_algo;
+mod helper;
+mod history;
+mod version;
+
+#[cfg(test)]
+mod test;
+
+pub mod manager;
+
+// ---- re-exports ----
+
+pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest};
+pub use manager::WriteLock;
+pub use version::*;
+
+// ---- defines: partitions ----
+
+/// A partition id, which is stored on 16 bits
+/// i.e. we have up to 2**16 partitions.
+/// (in practice we have exactly 2**PARTITION_BITS partitions)
+pub type Partition = u16;
+
+// TODO: make this constant parametrizable in the config file
+// For deployments with many nodes it might make sense to bump
+// it up to 10.
+// Maximum value : 16
+/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in
+/// presence of numerous nodes, but exponentially bigger ring. Max 16
+pub const PARTITION_BITS: usize = 8;
+
+const NB_PARTITIONS: usize = 1usize << PARTITION_BITS;
+
+// ---- defines: nodes ----
+
+// Type to store compactly the id of a node in the system
+// Change this to u16 the day we want to have more than 256 nodes in a cluster
+pub type CompactNodeType = u8;
+pub const MAX_NODE_NUMBER: usize = 256;
+
+// ======== actual data structures for the layout data ========
+// ======== that is persisted to disk                  ========
+// some small utility impls are at the end of this file,
+// but most of the code that actually computes stuff is in
+// version.rs, history.rs and helper.rs
+
+/// Layout format of version 0.8, kept as the starting point of the
+/// migration chain (`InitialFormat`).
+mod v08 {
+	use crate::layout::CompactNodeType;
+	use garage_util::crdt::LwwMap;
+	use garage_util::data::{Hash, Uuid};
+	use serde::{Deserialize, Serialize};
+
+	/// The layout of the cluster, i.e. the list of roles
+	/// which are assigned to each cluster node
+	#[derive(Clone, Debug, Serialize, Deserialize)]
+	pub struct ClusterLayout {
+		pub version: u64,
+
+		pub replication_factor: usize,
+		pub roles: LwwMap<Uuid, NodeRoleV>,
+
+		// see comments in v010::LayoutVersion
+		pub node_id_vec: Vec<Uuid>,
+		#[serde(with = "serde_bytes")]
+		pub ring_assignation_data: Vec<CompactNodeType>,
+
+		/// Role changes which are staged for the next version of the layout
+		pub staging: LwwMap<Uuid, NodeRoleV>,
+		pub staging_hash: Hash,
+	}
+
+	/// Wrapper around an optional role; `None` means the node has no role
+	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+	pub struct NodeRoleV(pub Option<NodeRole>);
+
+	/// The user-assigned roles of cluster nodes
+	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+	pub struct NodeRole {
+		/// Datacenter to which this entry belongs. This information is used to
+		/// perform a better geodistribution
+		pub zone: String,
+		/// The capacity of the node
+		/// If this is set to None, the node does not participate in storing data for the system
+		/// and is only active as an API gateway to other nodes
+		pub capacity: Option<u64>,
+		/// A set of tags to recognize the node
+		pub tags: Vec<String>,
+	}
+
+	impl garage_util::migrate::InitialFormat for ClusterLayout {}
+}
+
+mod v09 {
+	use super::v08;
+	use crate::layout::CompactNodeType;
+	use garage_util::crdt::{Lww, LwwMap};
+	use garage_util::data::{Hash, Uuid};
+	use serde::{Deserialize, Serialize};
+	pub use v08::{NodeRole, NodeRoleV};
+
+	/// The layout of the cluster, i.e. the list of roles
+	/// which are assigned to each cluster node
+	#[derive(Clone, Debug, Serialize, Deserialize)]
+	pub struct ClusterLayout {
+		pub version: u64,
+
+		pub replication_factor: usize,
+
+		/// This attribute is only used to retain the previously computed partition size,
+		/// to know to what extent it changes with the layout update.
+		pub partition_size: u64,
+		/// Parameters used to compute the assignment currently given by
+		/// ring_assignment_data
+		pub parameters: LayoutParameters,
+
+		pub roles: LwwMap<Uuid, NodeRoleV>,
+
+		// see comments in v010::LayoutVersion
+		pub node_id_vec: Vec<Uuid>,
+		#[serde(with = "serde_bytes")]
+		pub ring_assignment_data: Vec<CompactNodeType>,
+
+		/// Parameters to be used in the next partition assignment computation.
+		pub staging_parameters: Lww<LayoutParameters>,
+		/// Role changes which are staged for the next version of the layout
+		pub staging_roles: LwwMap<Uuid, NodeRoleV>,
+		pub staging_hash: Hash,
+	}
+
+	/// This struct is used to set the parameters to be used in the assignment computation
+	/// algorithm. It is stored as a Crdt.
+	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
+	pub struct LayoutParameters {
+		pub zone_redundancy: ZoneRedundancy,
+	}
+
+	/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
+	/// of each partition on at least that number of different zones.
+	/// Otherwise, copies will be stored on the maximum possible number of zones.
+	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
+	pub enum ZoneRedundancy {
+		AtLeast(usize),
+		Maximum,
+	}
+
+	/// Migration from the v0.8 layout format to the v0.9 one.
+	impl garage_util::migrate::Migrate for ClusterLayout {
+		const VERSION_MARKER: &'static [u8] = b"G09layout";
+
+		type Previous = v08::ClusterLayout;
+
+		/// Converts a v0.8 layout: capacities are rescaled, a partition size is
+		/// derived from the existing assignment, and the layout parameters
+		/// (current and staged) are set to maximum zone redundancy.
+		fn migrate(previous: Self::Previous) -> Self {
+			use itertools::Itertools;
+
+			// In the old layout, capacities are in an arbitrary unit,
+			// but in the new layout they are in bytes.
+			// Here we arbitrarily multiply everything by 1G,
+			// such that 1 old capacity unit = 1GB in the new units.
+			// This is totally arbitrary and won't work for most users.
+			let cap_mul = 1024 * 1024 * 1024;
+			let roles = multiply_all_capacities(previous.roles, cap_mul);
+			let staging_roles = multiply_all_capacities(previous.staging, cap_mul);
+			let node_id_vec = previous.node_id_vec;
+
+			// Determine partition size
+			// Sorting groups identical node indices together, so that
+			// `dedup_with_count` yields one (number of occurrences, node index)
+			// pair per node present in the assignment.
+			let mut tmp = previous.ring_assignation_data.clone();
+			tmp.sort();
+			let partition_size = tmp
+				.into_iter()
+				.dedup_with_count()
+				.map(|(npart, node)| {
+					// Rescaled capacity of the node divided by its number of
+					// occurrences; a node without role or capacity counts as 0.
+					roles
+						.get(&node_id_vec[node as usize])
+						.and_then(|p| p.0.as_ref().and_then(|r| r.capacity))
+						.unwrap_or(0) / npart as u64
+				})
+				// The smallest per-node value is kept; 0 if the assignment is empty
+				.min()
+				.unwrap_or(0);
+
+			// By default, zone_redundancy is maximum possible value
+			let parameters = LayoutParameters {
+				zone_redundancy: ZoneRedundancy::Maximum,
+			};
+
+			Self {
+				version: previous.version,
+				replication_factor: previous.replication_factor,
+				partition_size,
+				parameters,
+				roles,
+				node_id_vec,
+				ring_assignment_data: previous.ring_assignation_data,
+				staging_parameters: Lww::new(parameters),
+				staging_roles,
+				staging_hash: [0u8; 32].into(), // will be set in the next migration
+			}
+		}
+	}
+
+	/// Builds a copy of `old_roles` in which every defined capacity is
+	/// multiplied by `mul`. Timestamps are kept, and entries without a role
+	/// or without a capacity are copied unchanged.
+	fn multiply_all_capacities(
+		old_roles: LwwMap<Uuid, NodeRoleV>,
+		mul: u64,
+	) -> LwwMap<Uuid, NodeRoleV> {
+		let mut scaled = LwwMap::new();
+		for (node, ts, old_role) in old_roles.items() {
+			let mut role = old_role.clone();
+			let capacity = role.0.as_mut().and_then(|r| r.capacity.as_mut());
+			if let Some(cap) = capacity {
+				*cap *= mul;
+			}
+			scaled.merge_raw(node, *ts, &role);
+		}
+		scaled
+	}
+}
+
+mod v010 {
+	use super::v09;
+	use crate::layout::CompactNodeType;
+	use garage_util::crdt::{Lww, LwwMap};
+	use garage_util::data::Uuid;
+	use serde::{Deserialize, Serialize};
+	use std::collections::BTreeMap;
+	pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy};
+
+	/// Number of old (non-live) versions to keep, see LayoutHistory::old_versions
+	pub const OLD_VERSION_COUNT: usize = 5;
+
+	/// The history of cluster layouts, with trackers to keep a record
+	/// of which nodes are up-to-date to current cluster data
+	#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+	pub struct LayoutHistory {
+		/// The versions currently in use in the cluster
+		pub versions: Vec<LayoutVersion>,
+		/// At most 5 of the previous versions (see `OLD_VERSION_COUNT`), not used
+		/// by the garage_table module, but useful for the garage_block module
+		/// to find data blocks that have not yet been moved
+		pub old_versions: Vec<LayoutVersion>,
+
+		/// Update trackers
+		pub update_trackers: UpdateTrackers,
+
+		/// Staged changes for the next version
+		pub staging: Lww<LayoutStaging>,
+	}
+
+	/// A version of the layout of the cluster, i.e. the list of roles
+	/// which are assigned to each cluster node
+	#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+	pub struct LayoutVersion {
+		/// The number of this version
+		pub version: u64,
+
+		/// Roles assigned to nodes in this version
+		pub roles: LwwMap<Uuid, NodeRoleV>,
+		/// Parameters used to compute the assignment currently given by
+		/// ring_assignment_data
+		pub parameters: LayoutParameters,
+
+		/// The number of replicas for each data partition
+		pub replication_factor: usize,
+		/// This attribute is only used to retain the previously computed partition size,
+		/// to know to what extent does it change with the layout update.
+		pub partition_size: u64,
+
+		/// node_id_vec: a vector of node IDs with a role assigned
+		/// in the system (this includes gateway nodes).
+		/// The order here is different than the vec stored by `roles`, because:
+		/// 1. non-gateway nodes are first so that they have lower numbers
+		/// 2. nodes that don't have a role are excluded (but they need to
+		///    stay in the CRDT as tombstones)
+		pub node_id_vec: Vec<Uuid>,
+		/// number of non-gateway nodes, which are the first ids in node_id_vec
+		pub nongateway_node_count: usize,
+		/// The assignation of data partitions to nodes, the values
+		/// are indices in node_id_vec
+		#[serde(with = "serde_bytes")]
+		pub ring_assignment_data: Vec<CompactNodeType>,
+	}
+
+	/// The staged changes for the next layout version
+	#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+	pub struct LayoutStaging {
+		/// Parameters to be used in the next partition assignment computation.
+		pub parameters: Lww<LayoutParameters>,
+		/// Role changes which are staged for the next version of the layout
+		pub roles: LwwMap<Uuid, NodeRoleV>,
+	}
+
+	/// The tracker of acknowledgments and data syncs around the cluster
+	#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
+	pub struct UpdateTrackers {
+		/// The highest layout version number each node has ack'ed
+		pub ack_map: UpdateTracker,
+		/// The highest layout version number each node has synced data for
+		pub sync_map: UpdateTracker,
+		/// The highest layout version number each node has
+		/// ack'ed that all other nodes have synced data for
+		pub sync_ack_map: UpdateTracker,
+	}
+
+	/// Generic update tracker struct
+	#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)]
+	pub struct UpdateTracker(pub BTreeMap<Uuid, u64>);
+
+	/// Migration from the v0.9 single-version layout to the v0.10 layout history.
+	impl garage_util::migrate::Migrate for LayoutHistory {
+		const VERSION_MARKER: &'static [u8] = b"G010lh";
+
+		type Previous = v09::ClusterLayout;
+
+		/// Builds a history containing the previous layout as its only version,
+		/// with no old versions and with all three update trackers initialised
+		/// to that version for every non-gateway node.
+		fn migrate(previous: Self::Previous) -> Self {
+			// One past the highest index in `node_id_vec` of a node that has a
+			// role with a capacity; 0 if there is no such node.
+			let nongateway_node_count = previous
+				.node_id_vec
+				.iter()
+				.enumerate()
+				.filter(|(_, uuid)| {
+					let role = previous.roles.get(uuid);
+					matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some())
+				})
+				.map(|(i, _)| i + 1)
+				.max()
+				.unwrap_or(0);
+
+			let version = LayoutVersion {
+				version: previous.version,
+				replication_factor: previous.replication_factor,
+				partition_size: previous.partition_size,
+				parameters: previous.parameters,
+				roles: previous.roles,
+				node_id_vec: previous.node_id_vec,
+				nongateway_node_count,
+				ring_assignment_data: previous.ring_assignment_data,
+			};
+			// Every non-gateway node is recorded at the migrated version number
+			let update_tracker = UpdateTracker(
+				version
+					.nongateway_nodes()
+					.iter()
+					.copied()
+					.map(|x| (x, version.version))
+					.collect::<BTreeMap<Uuid, u64>>(),
+			);
+			let staging = LayoutStaging {
+				parameters: previous.staging_parameters,
+				roles: previous.staging_roles,
+			};
+			Self {
+				versions: vec![version],
+				old_versions: vec![],
+				update_trackers: UpdateTrackers {
+					ack_map: update_tracker.clone(),
+					sync_map: update_tracker.clone(),
+					sync_ack_map: update_tracker,
+				},
+				// NOTE(review): the first argument of `Lww::raw` is presumably the
+				// timestamp, here taken from the layout version number; confirm.
+				staging: Lww::raw(previous.version, staging),
+			}
+		}
+	}
+}
+
+pub use v010::*;
+
+// ---- utility functions ----
+
+impl AutoCrdt for LayoutParameters {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl AutoCrdt for NodeRoleV {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
+/// Staged changes are merged field by field: parameters and roles are each
+/// merged with their own CRDT merge.
+impl Crdt for LayoutStaging {
+	fn merge(&mut self, other: &LayoutStaging) {
+		self.parameters.merge(&other.parameters);
+		self.roles.merge(&other.roles);
+	}
+}
+
+impl NodeRole {
+	/// Human-readable capacity: the byte size formatted by `ByteSize`, or
+	/// `"gateway"` for a node without capacity.
+	pub fn capacity_string(&self) -> String {
+		self.capacity.map_or_else(
+			|| "gateway".to_string(),
+			|bytes| ByteSize::b(bytes).to_string_as(false),
+		)
+	}
+
+	/// The tags of the node, joined with commas.
+	pub fn tags_string(&self) -> String {
+		let tags: &[String] = &self.tags;
+		tags.join(",")
+	}
+}
+
+/// Displays `Maximum` as `maximum` and `AtLeast(x)` as the bare number `x`.
+impl fmt::Display for ZoneRedundancy {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		if let ZoneRedundancy::AtLeast(count) = self {
+			write!(f, "{}", count)
+		} else {
+			write!(f, "maximum")
+		}
+	}
+}
+
+/// Parses `none`, `max` or `maximum` as `Maximum`, and any string that
+/// parses as a `usize` as `AtLeast` of that number.
+impl core::str::FromStr for ZoneRedundancy {
+	type Err = &'static str;
+	fn from_str(s: &str) -> Result<Self, Self::Err> {
+		if matches!(s, "none" | "max" | "maximum") {
+			return Ok(ZoneRedundancy::Maximum);
+		}
+
+		s.parse::<usize>()
+			.map(ZoneRedundancy::AtLeast)
+			.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")
+	}
+}
+
+impl UpdateTracker {
+	/// Merges `other` into `self`, keeping for each node the highest of the
+	/// two values. Returns true if any entry of `self` was added or raised.
+	fn merge(&mut self, other: &UpdateTracker) -> bool {
+		let mut changed = false;
+		for (node, theirs) in other.0.iter() {
+			match self.0.get_mut(node) {
+				// Our value is already at least as high: keep it
+				Some(ours) if *ours >= *theirs => (),
+				Some(ours) => {
+					*ours = *theirs;
+					changed = true;
+				}
+				None => {
+					self.0.insert(*node, *theirs);
+					changed = true;
+				}
+			}
+		}
+		changed
+	}
+
+	/// This bumps the update tracker for a given node up to the specified value.
+	/// This has potential impacts on the correctness of Garage and should only
+	/// be used in very specific circumstances.
+	pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool {
+		if let Some(current) = self.0.get_mut(&peer) {
+			if *current < value {
+				*current = value;
+				return true;
+			}
+			return false;
+		}
+		self.0.insert(peer, value);
+		true
+	}
+
+	/// Lowest tracked value among `storage_nodes`, where a node without entry
+	/// counts as `min_version`. Returns `min_version` for an empty node list.
+	pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 {
+		let mut lowest: Option<u64> = None;
+		for node in storage_nodes {
+			let value = self.get(node, min_version);
+			lowest = Some(match lowest {
+				Some(seen) if seen <= value => seen,
+				_ => value,
+			});
+		}
+		lowest.unwrap_or(min_version)
+	}
+
+	/// Tracked value of `node`, or `min_version` if the node has no entry.
+	pub fn get(&self, node: &Uuid, min_version: u64) -> u64 {
+		match self.0.get(node) {
+			Some(value) => *value,
+			None => min_version,
+		}
+	}
+}
+
+impl UpdateTrackers {
+	/// Merges the three trackers of `other` into `self`.
+	/// Returns true if at least one of them changed.
+	pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool {
+		// All three merges are run before combining the results, so that
+		// the short-circuiting `||` below cannot skip any of them.
+		let c1 = self.ack_map.merge(&other.ack_map);
+		let c2 = self.sync_map.merge(&other.sync_map);
+		let c3 = self.sync_ack_map.merge(&other.sync_ack_map);
+		c1 || c2 || c3
+	}
+}
--- a/src/rpc/layout/test.rs
+++ b/src/rpc/layout/test.rs
@ -0,0 +1,157 @@
+use std::cmp::min;
+use std::collections::HashMap;
+
+use garage_util::crdt::Crdt;
+use garage_util::error::*;
+
+use crate::layout::*;
+
+// This function checks that the partition size S computed is at least better than the
+// one given by a very naive algorithm. To do so, we try to run the naive algorithm
+// assuming a partition size of S+1. If we succeed, it means that the optimal assignment
+// was not optimal. The naive algorithm is the following :
+// - we compute the max number of partitions associated to every node, capped at the
+// partition number. It gives the number of tokens of every node.
+// - every zone has a number of tokens equal to the sum of the tokens of its nodes.
+// - we cycle over the partitions and associate zone tokens while respecting the
+// zone redundancy constraint.
+// NOTE: the naive algorithm is not optimal. Counter example:
+// take nb_partition = 3  ; replication_factor = 5; redundancy = 4;
+// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
+// With these parameters, the naive algo fails, whereas there is a solution:
+// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
+//
+// Returns Ok(true) when the naive algorithm runs out of zones, i.e. fails with
+// size S+1, and Ok(false) when it completes or when there is no zone at all.
+fn check_against_naive(cl: &LayoutVersion) -> Result<bool, Error> {
+	let over_size = cl.partition_size + 1;
+	let mut zone_token = HashMap::<String, usize>::new();
+
+	let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?;
+
+	if zones.is_empty() {
+		return Ok(false);
+	}
+
+	for z in zones.iter() {
+		zone_token.insert(z.clone(), 0);
+	}
+	// Each node brings min(NB_PARTITIONS, capacity / (S+1)) tokens to its zone
+	for uuid in cl.nongateway_nodes() {
+		let z = cl.expect_get_node_zone(&uuid);
+		let c = cl.expect_get_node_capacity(&uuid);
+		zone_token.insert(
+			z.to_string(),
+			zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize),
+		);
+	}
+
+	// For every partition, we count the number of zone already associated and
+	// the name of the last zone associated
+
+	// Same token counts, indexed by zone id instead of zone name
+	let mut id_zone_token = vec![0; zones.len()];
+	for (z, t) in zone_token.iter() {
+		id_zone_token[zone_to_id[z]] = *t;
+	}
+
+	let mut nb_token = vec![0; NB_PARTITIONS];
+	// `zones.len()` is not a valid zone id: it marks "no zone associated yet"
+	let mut last_zone = vec![zones.len(); NB_PARTITIONS];
+
+	let mut curr_zone = 0;
+
+	let redundancy = cl.effective_zone_redundancy();
+
+	for replic in 0..cl.replication_factor {
+		for p in 0..NB_PARTITIONS {
+			// Skip zones that have no token left, or that cannot be used again
+			// for this partition because of the redundancy constraint
+			while id_zone_token[curr_zone] == 0
+				|| (last_zone[p] == curr_zone
+					&& redundancy - nb_token[p] <= cl.replication_factor - replic)
+			{
+				curr_zone += 1;
+				if curr_zone >= zones.len() {
+					return Ok(true);
+				}
+			}
+			id_zone_token[curr_zone] -= 1;
+			if last_zone[p] != curr_zone {
+				nb_token[p] += 1;
+				last_zone[p] = curr_zone;
+			}
+		}
+	}
+
+	return Ok(false);
+}
+
+/// Prints each line of a layout computation message on its own line.
+fn show_msg(msg: &Message) {
+	msg.iter().for_each(|line| println!("{}", line));
+}
+
+/// Stages a storage role for node number `i` from the i-th capacity and the
+/// i-th zone, and stages `AtLeast(zone_redundancy)` as zone redundancy.
+fn update_layout(
+	cl: &mut LayoutHistory,
+	node_capacity_vec: &[u64],
+	node_zone_vec: &[&'static str],
+	zone_redundancy: usize,
+) {
+	let staging = cl.staging.get_mut();
+
+	let nodes = node_capacity_vec.iter().zip(node_zone_vec.iter());
+	for (i, (&capacity, &zone)) in nodes.enumerate() {
+		// Node ids are synthetic: 32 bytes all equal to the node's index
+		let role = NodeRole {
+			zone: zone.to_string(),
+			capacity: Some(capacity),
+			tags: vec![],
+		};
+		let update = staging
+			.roles
+			.update_mutator([i as u8; 32].into(), NodeRoleV(Some(role)));
+		staging.roles.merge(&update);
+	}
+
+	let parameters = LayoutParameters {
+		zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
+	};
+	staging.parameters.update(parameters);
+}
+
+// Applies four successive layout changes to a history with replication factor 3.
+// After each one, checks that the layout is consistent and that the computed
+// partition size is not beaten by the naive algorithm.
+#[test]
+fn test_assignment() {
+	// Step 1: three nodes in three zones, zone redundancy 3
+	let mut node_capacity_vec = vec![4000, 1000, 2000];
+	let mut node_zone_vec = vec!["A", "B", "C"];
+
+	let mut cl = LayoutHistory::new(3);
+	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
+	let v = cl.current().version;
+	let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
+	show_msg(&msg);
+	assert_eq!(cl.check(), Ok(()));
+	assert!(check_against_naive(cl.current()).unwrap());
+
+	// Step 2: nine nodes in six zones, zone redundancy 2
+	node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000];
+	node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"];
+	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2);
+	let v = cl.current().version;
+	let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
+	show_msg(&msg);
+	assert_eq!(cl.check(), Ok(()));
+	assert!(check_against_naive(cl.current()).unwrap());
+
+	// Step 3: same zones, some capacities changed, zone redundancy 3
+	node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
+	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3);
+	let v = cl.current().version;
+	let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
+	show_msg(&msg);
+	assert_eq!(cl.check(), Ok(()));
+	assert!(check_against_naive(cl.current()).unwrap());
+
+	// Step 4: same zones, very unequal capacities, zone redundancy 1
+	node_capacity_vec = vec![
+		4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000,
+	];
+	update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1);
+	let v = cl.current().version;
+	let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
+	show_msg(&msg);
+	assert_eq!(cl.check(), Ok(()));
+	assert!(check_against_naive(cl.current()).unwrap());
+}
--- a/src/rpc/layout/version.rs
+++ b/src/rpc/layout/version.rs
@ -1,375 +1,55 @@
-use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::collections::HashSet;
-use std::fmt;
+use std::convert::TryInto;

 use bytesize::ByteSize;
 use itertools::Itertools;

-use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap};
+use garage_util::crdt::{Crdt, LwwMap};
 use garage_util::data::*;
-use garage_util::encode::nonversioned_encode;
 use garage_util::error::*;

-use crate::graph_algo::*;
-
-use crate::ring::*;
-
-use std::convert::TryInto;
-
-const NB_PARTITIONS: usize = 1usize << PARTITION_BITS;
+use super::graph_algo::*;
+use super::*;

 // The Message type will be used to collect information on the algorithm.
-type Message = Vec<String>;
+pub type Message = Vec<String>;

-mod v08 {
-	use crate::ring::CompactNodeType;
-	use garage_util::crdt::LwwMap;
-	use garage_util::data::{Hash, Uuid};
-	use serde::{Deserialize, Serialize};
-
-	/// The layout of the cluster, i.e. the list of roles
-	/// which are assigned to each cluster node
-	#[derive(Clone, Debug, Serialize, Deserialize)]
-	pub struct ClusterLayout {
-		pub version: u64,
-
-		pub replication_factor: usize,
-		pub roles: LwwMap<Uuid, NodeRoleV>,
-
-		/// node_id_vec: a vector of node IDs with a role assigned
-		/// in the system (this includes gateway nodes).
-		/// The order here is different than the vec stored by `roles`, because:
-		/// 1. non-gateway nodes are first so that they have lower numbers
-		/// 2. nodes that don't have a role are excluded (but they need to
-		///    stay in the CRDT as tombstones)
-		pub node_id_vec: Vec<Uuid>,
-		/// the assignation of data partitions to node, the values
-		/// are indices in node_id_vec
-		#[serde(with = "serde_bytes")]
-		pub ring_assignation_data: Vec<CompactNodeType>,
-
-		/// Role changes which are staged for the next version of the layout
-		pub staging: LwwMap<Uuid, NodeRoleV>,
-		pub staging_hash: Hash,
-	}
-
-	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
-	pub struct NodeRoleV(pub Option<NodeRole>);
-
-	/// The user-assigned roles of cluster nodes
-	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
-	pub struct NodeRole {
-		/// Datacenter at which this entry belong. This information is used to
-		/// perform a better geodistribution
-		pub zone: String,
-		/// The capacity of the node
-		/// If this is set to None, the node does not participate in storing data for the system
-		/// and is only active as an API gateway to other nodes
-		pub capacity: Option<u64>,
-		/// A set of tags to recognize the node
-		pub tags: Vec<String>,
-	}
-
-	impl garage_util::migrate::InitialFormat for ClusterLayout {}
-}
-
-mod v09 {
-	use super::v08;
-	use crate::ring::CompactNodeType;
-	use garage_util::crdt::{Lww, LwwMap};
-	use garage_util::data::{Hash, Uuid};
-	use serde::{Deserialize, Serialize};
-	pub use v08::{NodeRole, NodeRoleV};
-
-	/// The layout of the cluster, i.e. the list of roles
-	/// which are assigned to each cluster node
-	#[derive(Clone, Debug, Serialize, Deserialize)]
-	pub struct ClusterLayout {
-		pub version: u64,
-
-		pub replication_factor: usize,
-
-		/// This attribute is only used to retain the previously computed partition size,
-		/// to know to what extent does it change with the layout update.
-		pub partition_size: u64,
-		/// Parameters used to compute the assignment currently given by
-		/// ring_assignment_data
-		pub parameters: LayoutParameters,
-
-		pub roles: LwwMap<Uuid, NodeRoleV>,
-
-		/// see comment in v08::ClusterLayout
-		pub node_id_vec: Vec<Uuid>,
-		/// see comment in v08::ClusterLayout
-		#[serde(with = "serde_bytes")]
-		pub ring_assignment_data: Vec<CompactNodeType>,
-
-		/// Parameters to be used in the next partition assignment computation.
-		pub staging_parameters: Lww<LayoutParameters>,
-		/// Role changes which are staged for the next version of the layout
-		pub staging_roles: LwwMap<Uuid, NodeRoleV>,
-		pub staging_hash: Hash,
-	}
-
-	/// This struct is used to set the parameters to be used in the assignment computation
-	/// algorithm. It is stored as a Crdt.
-	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
-	pub struct LayoutParameters {
-		pub zone_redundancy: ZoneRedundancy,
-	}
-
-	/// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies
-	/// of each partition on at least that number of different zones.
-	/// Otherwise, copies will be stored on the maximum possible number of zones.
-	#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)]
-	pub enum ZoneRedundancy {
-		AtLeast(usize),
-		Maximum,
-	}
-
-	impl garage_util::migrate::Migrate for ClusterLayout {
-		const VERSION_MARKER: &'static [u8] = b"G09layout";
-
-		type Previous = v08::ClusterLayout;
-
-		fn migrate(previous: Self::Previous) -> Self {
-			use itertools::Itertools;
-
-			// In the old layout, capacities are in an arbitrary unit,
-			// but in the new layout they are in bytes.
-			// Here we arbitrarily multiply everything by 1G,
-			// such that 1 old capacity unit = 1GB in the new units.
-			// This is totally arbitrary and won't work for most users.
-			let cap_mul = 1024 * 1024 * 1024;
-			let roles = multiply_all_capacities(previous.roles, cap_mul);
-			let staging_roles = multiply_all_capacities(previous.staging, cap_mul);
-			let node_id_vec = previous.node_id_vec;
-
-			// Determine partition size
-			let mut tmp = previous.ring_assignation_data.clone();
-			tmp.sort();
-			let partition_size = tmp
-				.into_iter()
-				.dedup_with_count()
-				.map(|(npart, node)| {
-					roles
-						.get(&node_id_vec[node as usize])
-						.and_then(|p| p.0.as_ref().and_then(|r| r.capacity))
-						.unwrap_or(0) / npart as u64
-				})
-				.min()
-				.unwrap_or(0);
-
-			// By default, zone_redundancy is maximum possible value
-			let parameters = LayoutParameters {
-				zone_redundancy: ZoneRedundancy::Maximum,
-			};
-
-			let mut res = Self {
-				version: previous.version,
-				replication_factor: previous.replication_factor,
-				partition_size,
-				parameters,
-				roles,
-				node_id_vec,
-				ring_assignment_data: previous.ring_assignation_data,
-				staging_parameters: Lww::new(parameters),
-				staging_roles,
-				staging_hash: [0u8; 32].into(),
-			};
-			res.staging_hash = res.calculate_staging_hash();
-			res
-		}
-	}
-
-	fn multiply_all_capacities(
-		old_roles: LwwMap<Uuid, NodeRoleV>,
-		mul: u64,
-	) -> LwwMap<Uuid, NodeRoleV> {
-		let mut new_roles = LwwMap::new();
-		for (node, ts, role) in old_roles.items() {
-			let mut role = role.clone();
-			if let NodeRoleV(Some(NodeRole {
-				capacity: Some(ref mut cap),
-				..
-			})) = role
-			{
-				*cap *= mul;
-			}
-			new_roles.merge_raw(node, *ts, &role);
-		}
-		new_roles
-	}
-}
-
-pub use v09::*;
-
-impl AutoCrdt for LayoutParameters {
-	const WARN_IF_DIFFERENT: bool = true;
-}
-
-impl AutoCrdt for NodeRoleV {
-	const WARN_IF_DIFFERENT: bool = true;
-}
-
-impl NodeRole {
-	pub fn capacity_string(&self) -> String {
-		match self.capacity {
-			Some(c) => ByteSize::b(c).to_string_as(false),
-			None => "gateway".to_string(),
-		}
-	}
-
-	pub fn tags_string(&self) -> String {
-		self.tags.join(",")
-	}
-}
-
-impl fmt::Display for ZoneRedundancy {
-	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-		match self {
-			ZoneRedundancy::Maximum => write!(f, "maximum"),
-			ZoneRedundancy::AtLeast(x) => write!(f, "{}", x),
-		}
-	}
-}
-
-impl core::str::FromStr for ZoneRedundancy {
-	type Err = &'static str;
-	fn from_str(s: &str) -> Result<Self, Self::Err> {
-		match s {
-			"none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum),
-			x => {
-				let v = x
-					.parse::<usize>()
-					.map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?;
-				Ok(ZoneRedundancy::AtLeast(v))
-			}
-		}
-	}
-}
-
-// Implementation of the ClusterLayout methods unrelated to the assignment algorithm.
-impl ClusterLayout {
+impl LayoutVersion {
+	/// Creates an empty layout version for the given replication factor:
+	/// version number 0, partition size 0, no roles, no nodes (hence a
+	/// non-gateway node count of 0) and no partition assignment data.
 	pub fn new(replication_factor: usize) -> Self {
 		// We set the default zone redundancy to be Maximum, meaning that the maximum
 		// possible value will be used depending on the cluster topology
 		let parameters = LayoutParameters {
 			zone_redundancy: ZoneRedundancy::Maximum,
 		};
-		let staging_parameters = Lww::<LayoutParameters>::new(parameters);

-		let empty_lwwmap = LwwMap::new();
-
-		let mut ret = ClusterLayout {
+		LayoutVersion {
 			version: 0,
 			replication_factor,
 			partition_size: 0,
 			roles: LwwMap::new(),
 			node_id_vec: Vec::new(),
+			nongateway_node_count: 0,
 			ring_assignment_data: Vec::new(),
 			parameters,
-			staging_parameters,
-			staging_roles: empty_lwwmap,
-			staging_hash: [0u8; 32].into(),
-		};
-		ret.staging_hash = ret.calculate_staging_hash();
-		ret
-	}
-
-	fn calculate_staging_hash(&self) -> Hash {
-		let hashed_tuple = (&self.staging_roles, &self.staging_parameters);
-		blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..])
-	}
-
-	pub fn merge(&mut self, other: &ClusterLayout) -> bool {
-		match other.version.cmp(&self.version) {
-			Ordering::Greater => {
-				*self = other.clone();
-				true
-			}
-			Ordering::Equal => {
-				self.staging_parameters.merge(&other.staging_parameters);
-				self.staging_roles.merge(&other.staging_roles);
-
-				let new_staging_hash = self.calculate_staging_hash();
-				let changed = new_staging_hash != self.staging_hash;
-
-				self.staging_hash = new_staging_hash;
-
-				changed
-			}
-			Ordering::Less => false,
 		}
 	}

-	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> {
-		match version {
-			None => {
-				let error = r#"
-Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
-To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
-				"#;
-				return Err(Error::Message(error.into()));
-			}
-			Some(v) => {
-				if v != self.version + 1 {
-					return Err(Error::Message("Invalid new layout version".into()));
-				}
-			}
-		}
+	// ===================== accessors ======================

-		self.roles.merge(&self.staging_roles);
-		self.roles.retain(|(_, _, v)| v.0.is_some());
-		self.parameters = *self.staging_parameters.get();
-
-		self.staging_roles.clear();
-		self.staging_hash = self.calculate_staging_hash();
-
-		let msg = self.calculate_partition_assignment()?;
-
-		self.version += 1;
-
-		Ok((self, msg))
-	}
-
-	pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
-		match version {
-			None => {
-				let error = r#"
-Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
-To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
-				"#;
-				return Err(Error::Message(error.into()));
-			}
-			Some(v) => {
-				if v != self.version + 1 {
-					return Err(Error::Message("Invalid new layout version".into()));
-				}
-			}
-		}
-
-		self.staging_roles.clear();
-		self.staging_parameters.update(self.parameters);
-		self.staging_hash = self.calculate_staging_hash();
-
-		self.version += 1;
-
-		Ok(self)
-	}
-
-	/// Returns a list of IDs of nodes that currently have
-	/// a role in the cluster
-	pub fn node_ids(&self) -> &[Uuid] {
+	/// Returns a list of IDs of nodes that have a role in this
+	/// version of the cluster layout, including gateway nodes.
+	/// This is the whole of `node_id_vec`, borrowed as a slice.
+	pub fn all_nodes(&self) -> &[Uuid] {
 		&self.node_id_vec[..]
 	}

-	pub fn num_nodes(&self) -> usize {
-		self.node_id_vec.len()
+	/// Returns a list of IDs of nodes that have a storage capacity
+	/// assigned in this version of the cluster layout.
+	/// This is the prefix of `node_id_vec` of length `nongateway_node_count`;
+	/// the slicing panics if that count exceeds the length of `node_id_vec`.
+	pub fn nongateway_nodes(&self) -> &[Uuid] {
+		&self.node_id_vec[..self.nongateway_node_count]
 	}

-	/// Returns the role of a node in the layout
+	/// Returns the role of a node in the layout, if it has one
 	pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> {
 		match self.roles.get(node) {
 			Some(NodeRoleV(Some(v))) => Some(v),
@ -377,41 +57,23 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		}
 	}

-	/// Returns the uuids of the non_gateway nodes in self.node_id_vec.
-	fn nongateway_nodes(&self) -> Vec<Uuid> {
-		let mut result = Vec::<Uuid>::new();
-		for uuid in self.node_id_vec.iter() {
-			match self.node_role(uuid) {
-				Some(role) if role.capacity.is_some() => result.push(*uuid),
-				_ => (),
-			}
-		}
-		result
-	}
-
-	/// Given a node uuids, this function returns the label of its zone
-	fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> {
-		match self.node_role(uuid) {
-			Some(role) => Ok(role.zone.clone()),
-			_ => Err(Error::Message(
-				"The Uuid does not correspond to a node present in the cluster.".into(),
-			)),
-		}
-	}
-
-	/// Given a node uuids, this function returns its capacity or fails if it does not have any
-	pub fn get_node_capacity(&self, uuid: &Uuid) -> Result<u64, Error> {
+	/// Returns the capacity of a node in the layout, if it has one.
+	/// The result is `None` both when the node has no role in this layout
+	/// version and when its role has no capacity set.
+	pub fn get_node_capacity(&self, uuid: &Uuid) -> Option<u64> {
 		match self.node_role(uuid) {
 			Some(NodeRole {
 				capacity: Some(cap),
 				zone: _,
 				tags: _,
-			}) => Ok(*cap),
-			_ => Err(Error::Message(
-				"The Uuid does not correspond to a node present in the \
-                    cluster or this node does not have a positive capacity."
-					.into(),
-			)),
+			}) => Some(*cap),
+			_ => None,
+		}
+	}
+
+	/// Given a node uuid, this function returns the label of its zone,
+	/// or `None` if the node has no role in this layout version
+	pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> {
+		match self.node_role(uuid) {
+			Some(role) => Some(&role.zone),
+			_ => None,
 		}
 	}

@ -435,17 +97,65 @@ To know the correct value of the new layout version, invoke `garage layout show`
 		))
 	}

-	/// Returns the sum of capacities of non gateway nodes in the cluster
-	fn get_total_capacity(&self) -> Result<u64, Error> {
-		let mut total_capacity = 0;
-		for uuid in self.nongateway_nodes().iter() {
-			total_capacity += self.get_node_capacity(uuid)?;
+	/// Returns the partition a hash belongs to: the first two bytes of
+	/// `position` are read as a big-endian `u16`, of which only the
+	/// `PARTITION_BITS` most significant bits are kept
+	pub fn partition_of(&self, position: &Hash) -> Partition {
+		let prefix: [u8; 2] = position.as_slice()[..2].try_into().unwrap();
+		u16::from_be_bytes(prefix) >> (16 - PARTITION_BITS)
 	}
-		Ok(total_capacity)
+
+	/// Iterates over all `1 << PARTITION_BITS` partitions. Each item is the
+	/// partition number together with a hash whose first two bytes hold that
+	/// number shifted into the `PARTITION_BITS` most significant bits (big
+	/// endian) and whose remaining bytes are zero.
+	pub fn partitions(&self) -> impl Iterator<Item = (Partition, Hash)> + '_ {
+		let partition_count = 1 << PARTITION_BITS;
+		(0..partition_count).map(|idx| {
+			let partition = idx as u16;
+			let [hi, lo] = (partition << (16 - PARTITION_BITS)).to_be_bytes();
+			let mut first_hash = [0u8; 32];
+			first_hash[0] = hi;
+			first_hash[1] = lo;
+			(partition, Hash::from(first_hash))
+		})
+	}
+
+	/// Returns the nodes assigned to the partition that `position` falls in.
+	///
+	/// Panics if `n` differs from the replication factor of this layout
+	/// version. If the assignment data does not have the expected length
+	/// (replication factor times number of partitions), a warning is logged
+	/// and the returned iterator is empty.
+	pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator<Item = Uuid> + '_ {
+		assert_eq!(n, self.replication_factor);
+
+		let rf = self.replication_factor;
+		let ring = &self.ring_assignment_data;
+
+		let replicas = if ring.len() == rf * (1 << PARTITION_BITS) {
+			// Each partition owns `rf` consecutive entries of the assignment data
+			let first = self.partition_of(position) as usize * rf;
+			&ring[first..first + rf]
+		} else {
+			warn!("Ring not yet ready, read/writes will be lost!");
+			&[]
+		};
+
+		// Entries of the assignment data are indices into `node_id_vec`
+		replicas.iter().map(move |idx| self.node_id_vec[*idx as usize])
+	}
+
+	// ===================== internal information extractors ======================
+
+	/// Same as `get_node_capacity`, but panics when that function returns
+	/// `None` instead of handing the `None` back to the caller.
+	/// Note that the panic message speaks of a zero capacity although the
+	/// panic is triggered by a missing (`None`) capacity.
+	pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 {
+		let capacity = self.get_node_capacity(uuid);
+		capacity.expect("non-gateway node with zero capacity")
+	}
+
+	/// Same as `get_node_zone`, but panics when that function returns `None`.
+	pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str {
+		let zone = self.get_node_zone(uuid);
+		zone.expect("node without a zone")
+	}
+
+	/// Adds up the capacities of all the non-gateway nodes of this layout
+	/// version. Panics (through `expect_get_node_capacity`) if one of these
+	/// nodes has no capacity.
+	fn get_total_capacity(&self) -> u64 {
+		self.nongateway_nodes()
+			.iter()
+			.map(|node| self.expect_get_node_capacity(node))
+			.sum()
 	}

 	/// Returns the effective value of the zone_redundancy parameter
-	fn effective_zone_redundancy(&self) -> usize {
+	pub(crate) fn effective_zone_redundancy(&self) -> usize {
 		match self.parameters.zone_redundancy {
 			ZoneRedundancy::AtLeast(v) => v,
 			ZoneRedundancy::Maximum => {
@ -465,10 +175,14 @@ To know the correct value of the new layout version, invoke `garage layout show`
 	/// (assignment, roles, parameters, partition size)
 	/// returns true if consistent, false if error
 	pub fn check(&self) -> Result<(), String> {
-		// Check that the hash of the staging data is correct
-		let staging_hash = self.calculate_staging_hash();
-		if staging_hash != self.staging_hash {
-			return Err("staging_hash is incorrect".into());
+		// Check that the assignment data has the correct length
+		let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor;
+		if self.ring_assignment_data.len() != expected_assignment_data_len {
+			return Err(format!(
+				"ring_assignment_data has incorrect length {} instead of {}",
+				self.ring_assignment_data.len(),
+				expected_assignment_data_len
+			));
 		}

 		// Check that node_id_vec contains the correct list of nodes
@ -486,16 +200,6 @@ To know the correct value of the new layout version, invoke `garage layout show`
 			return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes));
 		}

-		// Check that the assignment data has the correct length
-		let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor;
-		if self.ring_assignment_data.len() != expected_assignment_data_len {
-			return Err(format!(
-				"ring_assignment_data has incorrect length {} instead of {}",
-				self.ring_assignment_data.len(),
-				expected_assignment_data_len
-			));
-		}
-
 		// Check that the assigned nodes are correct identifiers
 		// of nodes that are assigned a role
 		// and that role is not the role of a gateway nodes
@ -524,10 +228,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 			// Check that every partition is spread over at least zone_redundancy zones.
 			let zones_of_p = nodes_of_p
 				.iter()
-				.map(|n| {
-					self.get_node_zone(&self.node_id_vec[*n as usize])
-						.expect("Zone not found.")
-				})
+				.map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize]))
 				.collect::<Vec<_>>();
 			if zones_of_p.iter().unique().count() < zone_redundancy {
 				return Err(format!(
@ -546,7 +247,7 @@ To know the correct value of the new layout version, invoke `garage layout show`
 			if *usage > 0 {
 				let uuid = self.node_id_vec[n];
 				let partusage = usage * self.partition_size;
-				let nodecap = self.get_node_capacity(&uuid).unwrap();
+				let nodecap = self.expect_get_node_capacity(&uuid);
 				if partusage > nodecap {
 					return Err(format!(
 						"node usage ({}) is bigger than node capacity ({})",
@ -574,12 +275,24 @@ To know the correct value of the new layout version, invoke `garage layout show`

 		Ok(())
 	}
-}

-// ====================================================================================
+	// ================== updates to layout, internals ===================
+
+	/// Consumes this layout version and turns it into the next one, using the
+	/// given staged changes. Returns the new version together with the messages
+	/// produced by `calculate_partition_assignment`, or the error returned by
+	/// that function.
+	pub(crate) fn calculate_next_version(
+		mut self,
+		staging: &LayoutStaging,
+	) -> Result<(Self, Message), Error> {
+		self.version += 1;
+
+		// Take in the staged roles, drop the entries whose role is empty
+		// (`NodeRoleV(None)`), and adopt the staged layout parameters.
+		self.roles.merge(&staging.roles);
+		self.roles.retain(|(_, _, v)| v.0.is_some());
+		self.parameters = *staging.parameters.get();
+
+		// Recompute the partition assignment now that roles and parameters are updated.
+		let msg = self.calculate_partition_assignment()?;
+
+		Ok((self, msg))
+	}

-// Implementation of the ClusterLayout methods related to the assignment algorithm.
-impl ClusterLayout {
 	/// This function calculates a new partition-to-node assignment.
 	/// The computed assignment respects the node replication factor
 	/// and the zone redundancy parameter It maximizes the capacity of a
@ -609,12 +322,12 @@ impl ClusterLayout {
 		// to use them as indices in the flow graphs.
 		let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?;

-		let nb_nongateway_nodes = self.nongateway_nodes().len();
-		if nb_nongateway_nodes < self.replication_factor {
+		if self.nongateway_nodes().len() < self.replication_factor {
 			return Err(Error::Message(format!(
 				"The number of nodes with positive \
            capacity ({}) is smaller than the replication factor ({}).",
-				nb_nongateway_nodes, self.replication_factor
+				self.nongateway_nodes().len(),
+				self.replication_factor
 			)));
 		}
 		if id_to_zone.len() < zone_redundancy {
@ -712,12 +425,14 @@ impl ClusterLayout {
 			.map(|(k, _, _)| *k)
 			.collect();

-		let mut new_node_id_vec = Vec::<Uuid>::new();
-		new_node_id_vec.extend(new_non_gateway_nodes);
-		new_node_id_vec.extend(new_gateway_nodes);
+		let old_node_id_vec = std::mem::take(&mut self.node_id_vec);

-		let old_node_id_vec = self.node_id_vec.clone();
-		self.node_id_vec = new_node_id_vec.clone();
+		self.nongateway_node_count = new_non_gateway_nodes.len();
+		self.node_id_vec.clear();
+		self.node_id_vec.extend(new_non_gateway_nodes);
+		self.node_id_vec.extend(new_gateway_nodes);
+
+		let new_node_id_vec = &self.node_id_vec;

 		// (2) We retrieve the old association
 		// We rewrite the old association with the new indices. We only consider partition
@ -756,7 +471,7 @@ impl ClusterLayout {
 			}
 		}

-		// We write the ring
+		// We clear the ring assignment data
 		self.ring_assignment_data = Vec::<CompactNodeType>::new();

 		Ok(Some(old_assignment))
@ -764,7 +479,9 @@ impl ClusterLayout {

 	/// This function generates ids for the zone of the nodes appearing in
 	/// self.node_id_vec.
-	fn generate_nongateway_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>), Error> {
+	pub(crate) fn generate_nongateway_zone_ids(
+		&self,
+	) -> Result<(Vec<String>, HashMap<String, usize>), Error> {
 		let mut id_to_zone = Vec::<String>::new();
 		let mut zone_to_id = HashMap::<String, usize>::new();

@ -797,7 +514,7 @@ impl ClusterLayout {
 		}

 		let mut s_down = 1;
-		let mut s_up = self.get_total_capacity()?;
+		let mut s_up = self.get_total_capacity();
 		while s_down + 1 < s_up {
 			g = self.generate_flow_graph(
 				(s_down + s_up) / 2,
@ -846,7 +563,7 @@ impl ClusterLayout {
 		zone_redundancy: usize,
 	) -> Result<Graph<FlowEdge>, Error> {
 		let vertices =
-			ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
+			LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len());
 		let mut g = Graph::<FlowEdge>::new(&vertices);
 		let nb_zones = zone_to_id.len();
 		for p in 0..NB_PARTITIONS {
@ -866,8 +583,8 @@ impl ClusterLayout {
 			}
 		}
 		for n in 0..self.nongateway_nodes().len() {
-			let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?;
-			let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?];
+			let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]);
+			let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])];
 			g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?;
 			for p in 0..NB_PARTITIONS {
 				if !exclude_assoc.contains(&(p, n)) {
@ -913,7 +630,7 @@ impl ClusterLayout {
 		// The algorithm is such that it will start with the flow that we just computed
 		// and find ameliorating paths from that.
 		for (p, n) in exclude_edge.iter() {
-			let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?];
+			let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])];
 			g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?;
 		}
 		g.compute_maximal_flow()?;
@ -933,7 +650,7 @@ impl ClusterLayout {
 		let mut cost = CostFunction::new();
 		for (p, assoc_p) in prev_assign.iter().enumerate() {
 			for n in assoc_p.iter() {
-				let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?];
+				let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])];
 				cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1);
 			}
 		}
@ -988,7 +705,7 @@ impl ClusterLayout {
 		let mut msg = Message::new();

 		let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64;
-		let total_cap = self.get_total_capacity()?;
+		let total_cap = self.get_total_capacity();
 		let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32);
 		msg.push(format!(
 			"Usable capacity / total cluster capacity:   {} / {} ({:.1} %)",
@ -1035,7 +752,7 @@ impl ClusterLayout {
 						let mut old_zones_of_p = Vec::<usize>::new();
 						for n in prev_assign[p].iter() {
 							old_zones_of_p
-								.push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]);
+								.push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]);
 						}
 						if !old_zones_of_p.contains(&z) {
 							new_partitions_zone[z] += 1;
@ -1077,7 +794,7 @@ impl ClusterLayout {
 		for z in 0..id_to_zone.len() {
 			let mut nodes_of_z = Vec::<usize>::new();
 			for n in 0..storing_nodes.len() {
-				if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] {
+				if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] {
 					nodes_of_z.push(n);
 				}
 			}
@ -1091,13 +808,13 @@ impl ClusterLayout {
 			let available_cap_z: u64 = self.partition_size * replicated_partitions as u64;
 			let mut total_cap_z = 0;
 			for n in nodes_of_z.iter() {
-				total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?;
+				total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]);
 			}
 			let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32);

 			for n in nodes_of_z.iter() {
 				let available_cap_n = stored_partitions[*n] as u64 * self.partition_size;
-				let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?;
+				let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]);
 				let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string();
 				table.push(format!(
 					"  {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)",
@ -1127,167 +844,3 @@ impl ClusterLayout {
 		Ok(msg)
 	}
 }
-
-// ====================================================================================
-
-#[cfg(test)]
-mod tests {
-	use super::{Error, *};
-	use std::cmp::min;
-
-	// This function checks that the partition size S computed is at least better than the
-	// one given by a very naive algorithm. To do so, we try to run the naive algorithm
-	// assuming a partion size of S+1. If we succed, it means that the optimal assignment
-	// was not optimal. The naive algorithm is the following :
-	// - we compute the max number of partitions associated to every node, capped at the
-	// partition number. It gives the number of tokens of every node.
-	// - every zone has a number of tokens equal to the sum of the tokens of its nodes.
-	// - we cycle over the partitions and associate zone tokens while respecting the
-	// zone redundancy constraint.
-	// NOTE: the naive algorithm is not optimal. Counter example:
-	// take nb_partition = 3  ; replication_factor = 5; redundancy = 4;
-	// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2)
-	// With these parameters, the naive algo fails, whereas there is a solution:
-	// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E)
-	fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> {
-		let over_size = cl.partition_size + 1;
-		let mut zone_token = HashMap::<String, usize>::new();
-
-		let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?;
-
-		if zones.is_empty() {
-			return Ok(false);
-		}
-
-		for z in zones.iter() {
-			zone_token.insert(z.clone(), 0);
-		}
-		for uuid in cl.nongateway_nodes().iter() {
-			let z = cl.get_node_zone(uuid)?;
-			let c = cl.get_node_capacity(uuid)?;
-			zone_token.insert(
-				z.clone(),
-				zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize),
-			);
-		}
-
-		// For every partition, we count the number of zone already associated and
-		// the name of the last zone associated
-
-		let mut id_zone_token = vec![0; zones.len()];
-		for (z, t) in zone_token.iter() {
-			id_zone_token[zone_to_id[z]] = *t;
-		}
-
-		let mut nb_token = vec![0; NB_PARTITIONS];
-		let mut last_zone = vec![zones.len(); NB_PARTITIONS];
-
-		let mut curr_zone = 0;
-
-		let redundancy = cl.effective_zone_redundancy();
-
-		for replic in 0..cl.replication_factor {
-			for p in 0..NB_PARTITIONS {
-				while id_zone_token[curr_zone] == 0
-					|| (last_zone[p] == curr_zone
-						&& redundancy - nb_token[p] <= cl.replication_factor - replic)
-				{
-					curr_zone += 1;
-					if curr_zone >= zones.len() {
-						return Ok(true);
-					}
-				}
-				id_zone_token[curr_zone] -= 1;
-				if last_zone[p] != curr_zone {
-					nb_token[p] += 1;
-					last_zone[p] = curr_zone;
-				}
-			}
-		}
-
-		return Ok(false);
-	}
-
-	fn show_msg(msg: &Message) {
-		for s in msg.iter() {
-			println!("{}", s);
-		}
-	}
-
-	fn update_layout(
-		cl: &mut ClusterLayout,
-		node_id_vec: &Vec<u8>,
-		node_capacity_vec: &Vec<u64>,
-		node_zone_vec: &Vec<String>,
-		zone_redundancy: usize,
-	) {
-		for i in 0..node_id_vec.len() {
-			if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) {
-				cl.node_id_vec.push(x);
-			}
-
-			let update = cl.staging_roles.update_mutator(
-				cl.node_id_vec[i],
-				NodeRoleV(Some(NodeRole {
-					zone: (node_zone_vec[i].to_string()),
-					capacity: (Some(node_capacity_vec[i])),
-					tags: (vec![]),
-				})),
-			);
-			cl.staging_roles.merge(&update);
-		}
-		cl.staging_parameters.update(LayoutParameters {
-			zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy),
-		});
-		cl.staging_hash = cl.calculate_staging_hash();
-	}
-
-	#[test]
-	fn test_assignment() {
-		let mut node_id_vec = vec![1, 2, 3];
-		let mut node_capacity_vec = vec![4000, 1000, 2000];
-		let mut node_zone_vec = vec!["A", "B", "C"]
-			.into_iter()
-			.map(|x| x.to_string())
-			.collect();
-
-		let mut cl = ClusterLayout::new(3);
-		update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3);
-		let v = cl.version;
-		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
-		show_msg(&msg);
-		assert_eq!(cl.check(), Ok(()));
-		assert!(matches!(check_against_naive(&cl), Ok(true)));
-
-		node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
-		node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000];
-		node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"]
-			.into_iter()
-			.map(|x| x.to_string())
-			.collect();
-		update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2);
-		let v = cl.version;
-		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
-		show_msg(&msg);
-		assert_eq!(cl.check(), Ok(()));
-		assert!(matches!(check_against_naive(&cl), Ok(true)));
-
-		node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000];
-		update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3);
-		let v = cl.version;
-		let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
-		show_msg(&msg);
-		assert_eq!(cl.check(), Ok(()));
-		assert!(matches!(check_against_naive(&cl), Ok(true)));
-
-		node_capacity_vec = vec![
-			4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000,
-		];
-		update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1);
-		let v = cl.version;
-		let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap();
-		show_msg(&msg);
-		assert_eq!(cl.check(), Ok(()));
-		assert!(matches!(check_against_naive(&cl), Ok(true)));
-	}
-}
--- a/src/rpc/lib.rs
+++ b/src/rpc/lib.rs
@ -11,10 +11,8 @@ mod consul;
 #[cfg(feature = "kubernetes-discovery")]
 mod kubernetes;

-pub mod graph_algo;
 pub mod layout;
 pub mod replication_mode;
-pub mod ring;
 pub mod system;

 pub mod rpc_helper;
--- a/src/rpc/replication_mode.rs
+++ b/src/rpc/replication_mode.rs
@ -54,4 +54,11 @@ impl ReplicationMode {
 			Self::ThreeWayDangerous => 1,
 		}
 	}
+
+	pub fn is_read_after_write_consistent(&self) -> bool {
+		match self {
+			Self::None | Self::TwoWay | Self::ThreeWay => true,
+			_ => false,
+		}
+	}
 }
--- a/src/rpc/ring.rs
+++ b/src/rpc/ring.rs
@ -1,164 +0,0 @@
-//! Module containing types related to computing nodes which should receive a copy of data blocks
-//! and metadata
-use std::convert::TryInto;
-
-use garage_util::data::*;
-
-use crate::layout::ClusterLayout;
-
-/// A partition id, which is stored on 16 bits
-/// i.e. we have up to 2**16 partitions.
-/// (in practice we have exactly 2**PARTITION_BITS partitions)
-pub type Partition = u16;
-
-// TODO: make this constant parametrizable in the config file
-// For deployments with many nodes it might make sense to bump
-// it up to 10.
-// Maximum value : 16
-/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in
-/// presence of numerous nodes, but exponentially bigger ring. Max 16
-pub const PARTITION_BITS: usize = 8;
-
-const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
-
-/// A ring distributing fairly objects to nodes
-#[derive(Clone)]
-pub struct Ring {
-	/// The replication factor for this ring
-	pub replication_factor: usize,
-
-	/// The network configuration used to generate this ring
-	pub layout: ClusterLayout,
-
-	// Internal order of nodes used to make a more compact representation of the ring
-	nodes: Vec<Uuid>,
-
-	// The list of entries in the ring
-	ring: Vec<RingEntry>,
-}
-
-// Type to store compactly the id of a node in the system
-// Change this to u16 the day we want to have more than 256 nodes in a cluster
-pub type CompactNodeType = u8;
-pub const MAX_NODE_NUMBER: usize = 256;
-
-// The maximum number of times an object might get replicated
-// This must be at least 3 because Garage supports 3-way replication
-// Here we use 6 so that the size of a ring entry is 8 bytes
-// (2 bytes partition id, 6 bytes node numbers as u8s)
-const MAX_REPLICATION: usize = 6;
-
-/// An entry in the ring
-#[derive(Clone, Debug)]
-struct RingEntry {
-	// The two first bytes of the first hash that goes in this partition
-	// (the next bytes are zeroes)
-	hash_prefix: u16,
-	// The nodes that store this partition, stored as a list of positions in the `nodes`
-	// field of the Ring structure
-	// Only items 0 up to ring.replication_factor - 1 are used, others are zeros
-	nodes_buf: [CompactNodeType; MAX_REPLICATION],
-}
-
-impl Ring {
-	pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self {
-		if replication_factor != layout.replication_factor {
-			warn!("Could not build ring: replication factor does not match between local configuration and network role assignment.");
-			return Self::empty(layout, replication_factor);
-		}
-
-		if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) {
-			warn!("Could not build ring: network role assignment data has invalid length");
-			return Self::empty(layout, replication_factor);
-		}
-
-		let nodes = layout.node_id_vec.clone();
-		let ring = (0..(1 << PARTITION_BITS))
-			.map(|i| {
-				let top = (i as u16) << (16 - PARTITION_BITS);
-				let mut nodes_buf = [0u8; MAX_REPLICATION];
-				nodes_buf[..replication_factor].copy_from_slice(
-					&layout.ring_assignment_data
-						[replication_factor * i..replication_factor * (i + 1)],
-				);
-				RingEntry {
-					hash_prefix: top,
-					nodes_buf,
-				}
-			})
-			.collect::<Vec<_>>();
-
-		Self {
-			replication_factor,
-			layout,
-			nodes,
-			ring,
-		}
-	}
-
-	fn empty(layout: ClusterLayout, replication_factor: usize) -> Self {
-		Self {
-			replication_factor,
-			layout,
-			nodes: vec![],
-			ring: vec![],
-		}
-	}
-
-	/// Get the partition in which data would fall on
-	pub fn partition_of(&self, position: &Hash) -> Partition {
-		let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap());
-		top >> (16 - PARTITION_BITS)
-	}
-
-	/// Get the list of partitions and the first hash of a partition key that would fall in it
-	pub fn partitions(&self) -> Vec<(Partition, Hash)> {
-		let mut ret = vec![];
-
-		for (i, entry) in self.ring.iter().enumerate() {
-			let mut location = [0u8; 32];
-			location[..2].copy_from_slice(&u16::to_be_bytes(entry.hash_prefix)[..]);
-			ret.push((i as u16, location.into()));
-		}
-		if !ret.is_empty() {
-			assert_eq!(ret[0].1, [0u8; 32].into());
-		}
-
-		ret
-	}
-
-	/// Walk the ring to find the n servers in which data should be replicated
-	pub fn get_nodes(&self, position: &Hash, n: usize) -> Vec<Uuid> {
-		if self.ring.len() != 1 << PARTITION_BITS {
-			warn!("Ring not yet ready, read/writes will be lost!");
-			return vec![];
-		}
-
-		let partition_idx = self.partition_of(position) as usize;
-		let partition = &self.ring[partition_idx];
-
-		let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap());
-		// Check that we haven't messed up our partition table, i.e. that this partition
-		// table entrey indeed corresponds to the item we are storing
-		assert_eq!(
-			partition.hash_prefix & PARTITION_MASK_U16,
-			top & PARTITION_MASK_U16
-		);
-
-		assert!(n <= self.replication_factor);
-		partition.nodes_buf[..n]
-			.iter()
-			.map(|i| self.nodes[*i as usize])
-			.collect::<Vec<_>>()
-	}
-}
-
-#[cfg(test)]
-mod tests {
-	use super::*;
-
-	#[test]
-	fn test_ring_entry_size() {
-		assert_eq!(std::mem::size_of::<RingEntry>(), 8);
-	}
-}
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@ -1,12 +1,12 @@
 //! Contain structs related to making RPCs
-use std::sync::Arc;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
 use std::time::Duration;

 use futures::future::join_all;
 use futures::stream::futures_unordered::FuturesUnordered;
 use futures::stream::StreamExt;
 use tokio::select;
-use tokio::sync::watch;

 use opentelemetry::KeyValue;
 use opentelemetry::{
@ -26,8 +26,8 @@ use garage_util::data::*;
 use garage_util::error::Error;
 use garage_util::metrics::RecordDuration;

+use crate::layout::{LayoutHelper, LayoutHistory};
 use crate::metrics::RpcMetrics;
-use crate::ring::Ring;

 // Default RPC timeout = 5 minutes
 const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
@ -36,11 +36,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
 #[derive(Copy, Clone)]
 pub struct RequestStrategy {
 	/// Min number of response to consider the request successful
-	pub rs_quorum: Option<usize>,
-	/// Should requests be dropped after enough response are received
-	pub rs_interrupt_after_quorum: bool,
+	rs_quorum: Option<usize>,
+	/// Send all requests at once
+	rs_send_all_at_once: Option<bool>,
 	/// Request priority
-	pub rs_priority: RequestPriority,
+	rs_priority: RequestPriority,
 	/// Custom timeout for this request
 	rs_timeout: Timeout,
 }
@ -57,7 +57,7 @@ impl RequestStrategy {
 	pub fn with_priority(prio: RequestPriority) -> Self {
 		RequestStrategy {
 			rs_quorum: None,
-			rs_interrupt_after_quorum: false,
+			rs_send_all_at_once: None,
 			rs_priority: prio,
 			rs_timeout: Timeout::Default,
 		}
@ -67,10 +67,9 @@ impl RequestStrategy {
 		self.rs_quorum = Some(quorum);
 		self
 	}
-	/// Set if requests can be dropped after quorum has been reached
-	/// In general true for read requests, and false for write
-	pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self {
-		self.rs_interrupt_after_quorum = interrupt;
+	/// Set whether all requests should be sent at once (see `try_call_many`)
+	pub fn send_all_at_once(mut self, value: bool) -> Self {
+		self.rs_send_all_at_once = Some(value);
 		self
 	}
 	/// Deactivate timeout for this request
@ -91,7 +90,7 @@ pub struct RpcHelper(Arc<RpcHelperInner>);
 struct RpcHelperInner {
 	our_node_id: Uuid,
 	fullmesh: Arc<FullMeshPeeringStrategy>,
-	ring: watch::Receiver<Arc<Ring>>,
+	layout: Arc<RwLock<LayoutHelper>>,
 	metrics: RpcMetrics,
 	rpc_timeout: Duration,
 }
@ -100,7 +99,7 @@ impl RpcHelper {
 	pub(crate) fn new(
 		our_node_id: Uuid,
 		fullmesh: Arc<FullMeshPeeringStrategy>,
-		ring: watch::Receiver<Arc<Ring>>,
+		layout: Arc<RwLock<LayoutHelper>>,
 		rpc_timeout: Option<Duration>,
 	) -> Self {
 		let metrics = RpcMetrics::new();
@ -108,7 +107,7 @@ impl RpcHelper {
 		Self(Arc::new(RpcHelperInner {
 			our_node_id,
 			fullmesh,
-			ring,
+			layout,
 			metrics,
 			rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
 		}))
@ -130,6 +129,12 @@ impl RpcHelper {
 		N: IntoReq<M> + Send,
 		H: StreamingEndpointHandler<M>,
 	{
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to);
+		let mut span = tracer.start(span_name);
+		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
+
 		let metric_tags = [
 			KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
 			KeyValue::new("from", format!("{:?}", self.0.our_node_id)),
@ -141,6 +146,7 @@ impl RpcHelper {
 		let node_id = to.into();
 		let rpc_call = endpoint
 			.call_streaming(&node_id, msg, strat.rs_priority)
+			.with_context(Context::current_with_span(span))
 			.record_duration(&self.0.metrics.rpc_duration, &metric_tags);

 		let timeout = async {
@ -183,12 +189,17 @@ impl RpcHelper {
 		N: IntoReq<M>,
 		H: StreamingEndpointHandler<M>,
 	{
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len());
+		let span = tracer.start(span_name);
+
 		let msg = msg.into_req().map_err(netapp::error::Error::from)?;

 		let resps = join_all(
 			to.iter()
 				.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
 		)
+		.with_context(Context::current_with_span(span))
 		.await;
 		Ok(to
 			.iter()
@ -220,6 +231,22 @@ impl RpcHelper {

 	/// Make a RPC call to multiple servers, returning either a Vec of responses,
 	/// or an error if quorum could not be reached due to too many errors
+	///
+	/// If RequestStrategy has send_all_at_once set, then all requests will be
+	/// sent at once, and `try_call_many` will return as soon as a quorum of
+	/// responses is achieved, dropping and cancelling the remaining requests.
+	///
+	/// Otherwise, `quorum` requests will be sent at the same time, and if an
+	/// error response is received, a new request will be sent to replace it.
+	/// The ordering of nodes to which requests are sent is determined by
+	/// the `RpcHelper::request_order` function, which takes into account
+	/// parameters such as node zones and measured ping values.
+	///
+	/// In both cases, the basic contract of this function is that even in the
+	/// absence of failures, the RPC call might not be driven to completion
+	/// on all of the specified nodes. It is therefore unfit for broadcast
+	/// write operations where we expect all nodes to successfully store
+	/// the written data.
 	pub async fn try_call_many<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
@ -236,31 +263,24 @@ impl RpcHelper {
 		let quorum = strategy.rs_quorum.unwrap_or(to.len());

 		let tracer = opentelemetry::global::tracer("garage");
-		let span_name = if strategy.rs_interrupt_after_quorum {
-			format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len())
-		} else {
-			format!(
-				"RPC {} to {} (quorum {})",
+		let span_name = format!(
+			"RPC [{}] try_call_many (quorum {}/{})",
 			endpoint.path(),
-				to.len(),
-				quorum
-			)
-		};
+			quorum,
+			to.len()
+		);
+
 		let mut span = tracer.start(span_name);
 		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
 		span.set_attribute(KeyValue::new("to", format!("{:?}", to)));
 		span.set_attribute(KeyValue::new("quorum", quorum as i64));
-		span.set_attribute(KeyValue::new(
-			"interrupt_after_quorum",
-			strategy.rs_interrupt_after_quorum.to_string(),
-		));

-		self.try_call_many_internal(endpoint, to, msg, strategy, quorum)
+		self.try_call_many_inner(endpoint, to, msg, strategy, quorum)
 			.with_context(Context::current_with_span(span))
 			.await
 	}

-	async fn try_call_many_internal<M, N, H, S>(
+	async fn try_call_many_inner<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
 		to: &[Uuid],
@ -274,129 +294,238 @@ impl RpcHelper {
 		H: StreamingEndpointHandler<M> + 'static,
 		S: Send + 'static,
 	{
-		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
+		// Once quorum is reached, other requests don't matter.
+		// What we do here is only send the required number of requests
+		// to reach a quorum, prioritizing nodes with the lowest latency.
+		// When there are errors, we start new requests to compensate.
+
+		// TODO: this could be made more aggressive, e.g. if after 2x the
+		// average ping of a given request, the response is not yet received,
+		// preemptively send an additional request to any remaining nodes.
+
+		// Reorder requests to prioritize closeness / low latency
+		let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied());
+		let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false);

 		// Build future for each request
 		// They are not started now: they are added below in a FuturesUnordered
 		// object that will take care of polling them (see below)
-		let requests = to.iter().cloned().map(|to| {
+		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
+		let mut requests = request_order.into_iter().map(|to| {
 			let self2 = self.clone();
 			let msg = msg.clone();
 			let endpoint2 = endpoint.clone();
-			(to, async move {
-				self2.call(&endpoint2, to, msg, strategy).await
-			})
+			async move { self2.call(&endpoint2, to, msg, strategy).await }
 		});

 		// Vectors in which success results and errors will be collected
 		let mut successes = vec![];
 		let mut errors = vec![];

-		if strategy.rs_interrupt_after_quorum {
-			// Case 1: once quorum is reached, other requests don't matter.
-			// What we do here is only send the required number of requests
-			// to reach a quorum, priorizing nodes with the lowest latency.
-			// When there are errors, we start new requests to compensate.
-
-			// Reorder requests to priorize closeness / low latency
-			let request_order = self.request_order(to);
-			let mut ord_requests = vec![(); request_order.len()]
-				.into_iter()
-				.map(|_| None)
-				.collect::<Vec<_>>();
-			for (to, fut) in requests {
-				let i = request_order.iter().position(|x| *x == to).unwrap();
-				ord_requests[i] = Some((to, fut));
-			}
-
-			// Make an iterator to take requests in their sorted order
-			let mut requests = ord_requests.into_iter().map(Option::unwrap);
-
 		// resp_stream will contain all of the requests that are currently in flight.
 		// (for the moment none, they will be added in the loop below)
 		let mut resp_stream = FuturesUnordered::new();

 		// Do some requests and collect results
-			'request_loop: while successes.len() < quorum {
+		while successes.len() < quorum {
 			// If the current set of requests that are running is not enough to possibly
 			// reach quorum, start some new requests.
-				while successes.len() + resp_stream.len() < quorum {
-					if let Some((req_to, fut)) = requests.next() {
-						let tracer = opentelemetry::global::tracer("garage");
-						let span = tracer.start(format!("RPC to {:?}", req_to));
-						resp_stream.push(tokio::spawn(
-							fut.with_context(Context::current_with_span(span)),
-						));
+			while send_all_at_once || successes.len() + resp_stream.len() < quorum {
+				if let Some(fut) = requests.next() {
+					resp_stream.push(fut)
 				} else {
-						// If we have no request to add, we know that we won't ever
-						// reach quorum: bail out now.
-						break 'request_loop;
-					}
-				}
-				assert!(!resp_stream.is_empty()); // because of loop invariants
-
-				// Wait for one request to terminate
-				match resp_stream.next().await.unwrap().unwrap() {
-					Ok(msg) => {
-						successes.push(msg);
-					}
-					Err(e) => {
-						errors.push(e);
-					}
-				}
-			}
-		} else {
-			// Case 2: all of the requests need to be sent in all cases,
-			// and need to terminate. (this is the case for writes that
-			// must be spread to n nodes)
-			// Just start all the requests in parallel and return as soon
-			// as the quorum is reached.
-			let mut resp_stream = requests
-				.map(|(_, fut)| fut)
-				.collect::<FuturesUnordered<_>>();
-
-			while let Some(resp) = resp_stream.next().await {
-				match resp {
-					Ok(msg) => {
-						successes.push(msg);
-						if successes.len() >= quorum {
 					break;
 				}
 			}
+
+			if successes.len() + resp_stream.len() < quorum {
+				// We know we won't ever reach quorum
+				break;
+			}
+
+			// Wait for one request to terminate
+			match resp_stream.next().await.unwrap() {
+				Ok(msg) => {
+					successes.push(msg);
+				}
 				Err(e) => {
 					errors.push(e);
 				}
 			}
 		}

-			if !resp_stream.is_empty() {
-				// Continue remaining requests in background.
-				// Note: these requests can get interrupted on process shutdown,
-				// we must not count on them being executed for certain.
-				// For all background things that have to happen with certainty,
-				// they have to be put in a proper queue that is persisted to disk.
-				tokio::spawn(async move {
-					resp_stream.collect::<Vec<Result<_, _>>>().await;
-				});
-			}
-		}
-
 		if successes.len() >= quorum {
 			Ok(successes)
 		} else {
 			let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
-			Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
+			Err(Error::Quorum(
+				quorum,
+				None,
+				successes.len(),
+				to.len(),
+				errors,
+			))
 		}
 	}

-	pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> {
+	/// Make a RPC call to multiple servers, returning either a Vec of responses,
+	/// or an error if quorum could not be reached due to too many errors
+	///
+	/// Contrary to try_call_many, this function is especially made for broadcast
+	/// write operations. In particular:
+	///
+	/// - The requests are sent to all specified nodes as soon as `try_write_many_sets`
+	///   is invoked.
+	///
+	/// - When `try_write_many_sets` returns, all remaining requests that haven't
+	///   completed move to a background task so that they have a chance to
+	///   complete successfully if there are no failures.
+	///
+	/// In addition, the nodes to which requests should be sent are divided in
+	/// "quorum sets", and `try_write_many_sets` only returns once a quorum
+	/// has been validated in each set. This is used in the case of cluster layout
+	/// changes, where data has to be written both in the old layout and in the
+	/// new one as long as all nodes have not successfully transitioned and
+	/// moved all data to the new layout.
+	pub async fn try_write_many_sets<M, N, H, S>(
+		&self,
+		endpoint: &Arc<Endpoint<M, H>>,
+		to_sets: &[Vec<Uuid>],
+		msg: N,
+		strategy: RequestStrategy,
+	) -> Result<Vec<S>, Error>
+	where
+		M: Rpc<Response = Result<S, Error>> + 'static,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M> + 'static,
+		S: Send + 'static,
+	{
+		let quorum = strategy
+			.rs_quorum
+			.expect("internal error: missing quorum value in try_write_many_sets");
+
+		let tracer = opentelemetry::global::tracer("garage");
+		let span_name = format!(
+			"RPC [{}] try_write_many_sets (quorum {} in {} sets)",
+			endpoint.path(),
+			quorum,
+			to_sets.len()
+		);
+
+		let mut span = tracer.start(span_name);
+		span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id)));
+		span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets)));
+		span.set_attribute(KeyValue::new("quorum", quorum as i64));
+
+		self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum)
+			.with_context(Context::current_with_span(span))
+			.await
+	}
+
+	async fn try_write_many_sets_inner<M, N, H, S>(
+		&self,
+		endpoint: &Arc<Endpoint<M, H>>,
+		to_sets: &[Vec<Uuid>],
+		msg: N,
+		strategy: RequestStrategy,
+		quorum: usize,
+	) -> Result<Vec<S>, Error>
+	where
+		M: Rpc<Response = Result<S, Error>> + 'static,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M> + 'static,
+		S: Send + 'static,
+	{
+		// Peers may appear in many quorum sets. Here, build a list of peers,
+		// mapping to the index of the quorum sets in which they appear.
+		let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum);
+
+		// Send one request to each peer of the quorum sets
+		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
+		let requests = result_tracker.nodes.keys().map(|peer| {
+			let self2 = self.clone();
+			let msg = msg.clone();
+			let endpoint2 = endpoint.clone();
+			let to = *peer;
+			async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }
+		});
+		let mut resp_stream = requests.collect::<FuturesUnordered<_>>();
+
+		// Drive requests to completion
+		while let Some((node, resp)) = resp_stream.next().await {
+			// Store the response in the correct vector and increment the
+			// appropriate counters
+			result_tracker.register_result(node, resp);
+
+			// If we have a quorum of ok in all quorum sets, then it's a success!
+			if result_tracker.all_quorums_ok() {
+				// Continue all other requests in background
+				tokio::spawn(async move {
+					resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await;
+				});
+
+				return Ok(result_tracker.success_values());
+			}
+
+			// If there is a quorum set for which too many errors were received,
+			// we know it's impossible to get a quorum, so return immediately.
+			if result_tracker.too_many_failures() {
+				break;
+			}
+		}
+
+		// At this point, there is no quorum and we know that a quorum
+		// will never be achieved. Currently, we drop all remaining requests.
+		// Should we still move them to background so that they can continue
+		// for non-failed nodes? Not doing so has no impact on correctness,
+		// but it means that more cancellation messages will be sent. Idk.
+		// (When an in-progress request future is dropped, Netapp automatically
+		// sends a cancellation message to the remote node to inform it that
+		// the result is no longer needed. In turn, if the remote node receives
+		// the cancellation message in time, it interrupts the task of the
+		// running request handler.)
+
+		// Failure, could not get quorum
+		Err(result_tracker.quorum_error())
+	}
+
+	// ---- functions not related to MAKING RPCs, but just determining to what nodes
+	//      they should be made and in which order ----
+
+	pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec<Uuid> {
+		let layout = self.0.layout.read().unwrap();
+
+		let mut ret = Vec::with_capacity(12);
+		let ver_iter = layout
+			.versions
+			.iter()
+			.rev()
+			.chain(layout.old_versions.iter().rev());
+		for ver in ver_iter {
+			if ver.version > layout.sync_map_min() {
+				continue;
+			}
+			let nodes = ver.nodes_of(position, ver.replication_factor);
+			for node in rpc_helper.request_order(&layout, nodes) {
+				if !ret.contains(&node) {
+					ret.push(node);
+				}
+			}
+		}
+		ret
+	}
+
+	fn request_order(
+		&self,
+		layout: &LayoutHistory,
+		nodes: impl Iterator<Item = Uuid>,
+	) -> Vec<Uuid> {
 		// Retrieve some status variables that we will use to sort requests
 		let peer_list = self.0.fullmesh.get_peer_list();
-		let ring: Arc<Ring> = self.0.ring.borrow().clone();
-		let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
-			Some(pc) => &pc.zone,
-			None => "",
-		};
+		let our_zone = layout
+			.current()
+			.get_node_zone(&self.0.our_node_id)
+			.unwrap_or("");

 		// Augment requests with some information used to sort them.
 		// The tuples are as follows:
@ -405,22 +534,18 @@ impl RpcHelper {
 		// By sorting this vec, we priorize ourself, then nodes in the same zone,
 		// and within a same zone we priorize nodes with the lowest latency.
 		let mut nodes = nodes
-			.iter()
 			.map(|to| {
-				let peer_zone = match ring.layout.node_role(to) {
-					Some(pc) => &pc.zone,
-					None => "",
-				};
+				let peer_zone = layout.current().get_node_zone(&to).unwrap_or("");
 				let peer_avg_ping = peer_list
 					.iter()
 					.find(|x| x.id.as_ref() == to.as_slice())
 					.and_then(|pi| pi.avg_ping)
 					.unwrap_or_else(|| Duration::from_secs(10));
 				(
-					*to != self.0.our_node_id,
+					to != self.0.our_node_id,
 					peer_zone != our_zone,
 					peer_avg_ping,
-					*to,
+					to,
 				)
 			})
 			.collect::<Vec<_>>();
@ -434,3 +559,108 @@ impl RpcHelper {
 			.collect::<Vec<_>>()
 	}
 }
+
+// ------- utility for tracking successes/errors among write sets --------
+
+pub struct QuorumSetResultTracker<S, E> {
+	/// The set of nodes and the index of the quorum sets they belong to
+	pub nodes: HashMap<Uuid, Vec<usize>>,
+	/// The quorum value, i.e. number of success responses to await in each set
+	pub quorum: usize,
+
+	/// The success responses received
+	pub successes: Vec<(Uuid, S)>,
+	/// The error responses received
+	pub failures: Vec<(Uuid, E)>,
+
+	/// The counters for successes in each set
+	pub success_counters: Box<[usize]>,
+	/// The counters for failures in each set
+	pub failure_counters: Box<[usize]>,
+	/// The total number of nodes in each set
+	pub set_lens: Box<[usize]>,
+}
+
+impl<S, E> QuorumSetResultTracker<S, E>
+where
+	E: std::fmt::Display,
+{
+	pub fn new<A>(sets: &[A], quorum: usize) -> Self
+	where
+		A: AsRef<[Uuid]>,
+	{
+		let mut nodes = HashMap::<Uuid, Vec<usize>>::new();
+		for (i, set) in sets.iter().enumerate() {
+			for node in set.as_ref().iter() {
+				nodes.entry(*node).or_default().push(i);
+			}
+		}
+
+		let num_nodes = nodes.len();
+		Self {
+			nodes,
+			quorum,
+			successes: Vec::with_capacity(num_nodes),
+			failures: vec![],
+			success_counters: vec![0; sets.len()].into_boxed_slice(),
+			failure_counters: vec![0; sets.len()].into_boxed_slice(),
+			set_lens: sets
+				.iter()
+				.map(|x| x.as_ref().len())
+				.collect::<Vec<_>>()
+				.into_boxed_slice(),
+		}
+	}
+
+	pub fn register_result(&mut self, node: Uuid, result: Result<S, E>) {
+		match result {
+			Ok(s) => {
+				self.successes.push((node, s));
+				for set in self.nodes.get(&node).unwrap().iter() {
+					self.success_counters[*set] += 1;
+				}
+			}
+			Err(e) => {
+				self.failures.push((node, e));
+				for set in self.nodes.get(&node).unwrap().iter() {
+					self.failure_counters[*set] += 1;
+				}
+			}
+		}
+	}
+
+	pub fn all_quorums_ok(&self) -> bool {
+		self.success_counters
+			.iter()
+			.all(|ok_cnt| *ok_cnt >= self.quorum)
+	}
+
+	pub fn too_many_failures(&self) -> bool {
+		self.failure_counters
+			.iter()
+			.zip(self.set_lens.iter())
+			.any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len)
+	}
+
+	pub fn success_values(self) -> Vec<S> {
+		self.successes
+			.into_iter()
+			.map(|(_, x)| x)
+			.collect::<Vec<_>>()
+	}
+
+	pub fn quorum_error(self) -> Error {
+		let errors = self
+			.failures
+			.iter()
+			.map(|(n, e)| format!("{:?}: {}", n, e))
+			.collect::<Vec<_>>();
+		Error::Quorum(
+			self.quorum,
+			Some(self.set_lens.len()),
+			self.successes.len(),
+			self.nodes.len(),
+			errors,
+		)
+	}
+}
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@ -1,9 +1,10 @@
 //! Module containing structs related to membership management
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::io::{Read, Write};
 use std::net::{IpAddr, SocketAddr};
 use std::path::{Path, PathBuf};
-use std::sync::{Arc, RwLock};
+use std::sync::atomic::Ordering;
+use std::sync::{Arc, RwLock, RwLockReadGuard};
 use std::time::{Duration, Instant};

 use arc_swap::ArcSwap;
@ -12,8 +13,7 @@ use futures::join;
 use serde::{Deserialize, Serialize};
 use sodiumoxide::crypto::sign::ed25519;
 use tokio::select;
-use tokio::sync::watch;
-use tokio::sync::Mutex;
+use tokio::sync::{watch, Notify};

 use netapp::endpoint::{Endpoint, EndpointHandler};
 use netapp::message::*;
@ -33,9 +33,10 @@ use garage_util::time::*;
 use crate::consul::ConsulDiscovery;
 #[cfg(feature = "kubernetes-discovery")]
 use crate::kubernetes::*;
-use crate::layout::*;
+use crate::layout::{
+	self, manager::LayoutManager, LayoutHelper, LayoutHistory, NodeRoleV, RpcLayoutDigest,
+};
 use crate::replication_mode::*;
-use crate::ring::*;
 use crate::rpc_helper::*;

 use crate::system_metrics::*;
@ -46,10 +47,10 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
 /// Version tag used for version check upon Netapp connection.
 /// Cluster nodes with different version tags are deemed
 /// incompatible and will refuse to connect.
-pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008
+pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A

 /// RPC endpoint used for calls related to membership
-pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
+pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc";

 /// RPC messages related to membership
 #[derive(Debug, Serialize, Deserialize, Clone)]
@ -58,17 +59,22 @@ pub enum SystemRpc {
 	Ok,
 	/// Request to connect to a specific node (in <pubkey>@<host>:<port> format)
 	Connect(String),
-	/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
-	PullClusterLayout,
 	/// Advertise Garage status. Answered with another AdvertiseStatus.
 	/// Exchanged with every node on a regular basis.
 	AdvertiseStatus(NodeStatus),
-	/// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout
-	AdvertiseClusterLayout(ClusterLayout),
 	/// Get known nodes states
 	GetKnownNodes,
 	/// Return known nodes
 	ReturnKnownNodes(Vec<KnownNodeInfo>),
+
+	/// Ask other node its cluster layout. Answered with AdvertiseClusterLayout
+	PullClusterLayout,
+	/// Advertisement of cluster layout. Sent spontaneously or in response to PullClusterLayout
+	AdvertiseClusterLayout(LayoutHistory),
+	/// Ask other node its cluster layout update trackers.
+	PullClusterLayoutTrackers,
+	/// Advertisement of cluster layout update trackers.
+	AdvertiseClusterLayoutTrackers(layout::UpdateTrackers),
 }

 impl Rpc for SystemRpc {
@ -84,7 +90,6 @@ pub struct System {
 	/// The id of this node
 	pub id: Uuid,

-	persist_cluster_layout: Persister<ClusterLayout>,
 	persist_peer_list: Persister<PeerList>,

 	local_status: ArcSwap<NodeStatus>,
@ -92,9 +97,8 @@ pub struct System {

 	pub netapp: Arc<NetApp>,
 	fullmesh: Arc<FullMeshPeeringStrategy>,
-	pub rpc: RpcHelper,

-	system_endpoint: Arc<Endpoint<SystemRpc, System>>,
+	pub(crate) system_endpoint: Arc<Endpoint<SystemRpc, System>>,

 	rpc_listen_addr: SocketAddr,
 	#[cfg(any(feature = "consul-discovery", feature = "kubernetes-discovery"))]
@ -106,15 +110,13 @@ pub struct System {
 	#[cfg(feature = "kubernetes-discovery")]
 	kubernetes_discovery: Option<KubernetesDiscoveryConfig>,

+	pub layout_manager: Arc<LayoutManager>,
+
 	metrics: SystemMetrics,

 	replication_mode: ReplicationMode,
 	replication_factor: usize,

-	/// The ring
-	pub ring: watch::Receiver<Arc<Ring>>,
-	update_ring: Mutex<watch::Sender<Arc<Ring>>>,
-
 	/// Path to metadata directory
 	pub metadata_dir: PathBuf,
 	/// Path to data directory
@ -124,14 +126,13 @@ pub struct System {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct NodeStatus {
 	/// Hostname of the node
-	pub hostname: String,
+	pub hostname: Option<String>,

 	/// Replication factor configured on the node
 	pub replication_factor: usize,
-	/// Cluster layout version
-	pub cluster_layout_version: u64,
-	/// Hash of cluster layout staging data
-	pub cluster_layout_staging_hash: Hash,
+
+	/// Cluster layout digest
+	pub layout_digest: RpcLayoutDigest,

 	/// Disk usage on partition containing metadata directory (tuple: `(avail, total)`)
 	#[serde(default)]
@ -197,20 +198,6 @@ pub fn read_node_id(metadata_dir: &Path) -> Result<NodeID, Error> {
 	Ok(NodeID::from_slice(&key[..]).unwrap())
 }

-#[cfg(not(windows))]
-fn set_private_key_perms(path: &Path) -> Result<(), Error> {
-	use std::os::unix::fs::PermissionsExt;
-	let perm = std::fs::Permissions::from_mode(0o600);
-	std::fs::set_permissions(path, perm)?;
-	Ok(())
-}
-
-#[cfg(windows)]
-fn set_private_key_perms(_path: &Path) -> Result<(), Error> {
-	// TODO(mediocregopher) figure out how to do this, but it's not strictly necessary
-	Ok(())
-}
-
 pub fn gen_node_key(metadata_dir: &Path) -> Result<NodeKey, Error> {
 	let mut key_file = metadata_dir.to_path_buf();
 	key_file.push("node_key");
@ -235,8 +222,11 @@ pub fn gen_node_key(metadata_dir: &Path) -> Result<NodeKey, Error> {
 		let (pubkey, key) = ed25519::gen_keypair();

 		{
+			use std::os::unix::fs::PermissionsExt;
 			let mut f = std::fs::File::create(key_file.as_path())?;
-			set_private_key_perms(key_file.as_path())?;
+			let mut perm = f.metadata()?.permissions();
+			perm.set_mode(0o600);
+			std::fs::set_permissions(key_file.as_path(), perm)?;
 			f.write_all(&key[..])?;
 		}

@ -258,8 +248,7 @@ impl System {
 		replication_mode: ReplicationMode,
 		config: &Config,
 	) -> Result<Arc<Self>, Error> {
-		let replication_factor = replication_mode.replication_factor();
-
+		// ---- setup netapp RPC protocol ----
 		let node_key =
 			gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
 		info!(
@ -267,82 +256,40 @@ impl System {
 			hex::encode(&node_key.public_key()[..8])
 		);

-		let persist_cluster_layout: Persister<ClusterLayout> =
-			Persister::new(&config.metadata_dir, "cluster_layout");
-		let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
+		let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key);
+		let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());

-		let cluster_layout = match persist_cluster_layout.load() {
-			Ok(x) => {
-				if x.replication_factor != replication_factor {
-					return Err(Error::Message(format!(
-						"Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
-						x.replication_factor,
-						replication_factor
-					)));
-				}
-				x
-			}
-			Err(e) => {
-				info!(
-					"No valid previous cluster layout stored ({}), starting fresh.",
-					e
-				);
-				ClusterLayout::new(replication_factor)
-			}
-		};
-
-		let metrics = SystemMetrics::new(replication_factor);
-
-		let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout);
-		local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics);
-
-		let ring = Ring::new(cluster_layout, replication_factor);
-		let (update_ring, ring) = watch::channel(Arc::new(ring));
-
-		let rpc_public_addr = match &config.rpc_public_addr {
-			Some(a_str) => {
-				use std::net::ToSocketAddrs;
-				match a_str.to_socket_addrs() {
-					Err(e) => {
-						error!(
-							"Cannot resolve rpc_public_addr {} from config file: {}.",
-							a_str, e
-						);
-						None
-					}
-					Ok(a) => {
-						let a = a.collect::<Vec<_>>();
-						if a.is_empty() {
-							error!("rpc_public_addr {} resolve to no known IP address", a_str);
-						}
-						if a.len() > 1 {
-							warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a);
-						}
-						a.into_iter().next()
-					}
-				}
-			}
-			None => {
-				let addr =
-					get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
-				if let Some(a) = addr {
-					warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a);
-				}
-				addr
-			}
-		};
+		// ---- setup netapp public listener and full mesh peering strategy ----
+		let rpc_public_addr = get_rpc_public_addr(config);
 		if rpc_public_addr.is_none() {
 			warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication.");
 		}

-		let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key);
 		let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr);
 		if let Some(ping_timeout) = config.rpc_ping_timeout_msec {
 			fullmesh.set_ping_timeout_millis(ping_timeout);
 		}

-		let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());
+		let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");

+		// ---- setup cluster layout and layout manager ----
+		let replication_factor = replication_mode.replication_factor();
+
+		let layout_manager = LayoutManager::new(
+			config,
+			netapp.id,
+			system_endpoint.clone(),
+			fullmesh.clone(),
+			replication_mode,
+		)?;
+
+		// ---- set up metrics and status exchange ----
+		let metrics = SystemMetrics::new(replication_factor);
+
+		let mut local_status = NodeStatus::initial(replication_factor, &layout_manager);
+		local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics);
+
+		// ---- if enabled, set up additional peer discovery methods ----
 		#[cfg(feature = "consul-discovery")]
 		let consul_discovery = match &config.consul_discovery {
 			Some(cfg) => Some(
@ -361,20 +308,14 @@ impl System {
 			warn!("Kubernetes discovery is not enabled in this build.");
 		}

+		// ---- done ----
 		let sys = Arc::new(System {
 			id: netapp.id.into(),
-			persist_cluster_layout,
 			persist_peer_list,
 			local_status: ArcSwap::new(Arc::new(local_status)),
 			node_status: RwLock::new(HashMap::new()),
 			netapp: netapp.clone(),
-			fullmesh: fullmesh.clone(),
-			rpc: RpcHelper::new(
-				netapp.id.into(),
 			fullmesh,
-				ring.clone(),
-				config.rpc_timeout_msec.map(Duration::from_millis),
-			),
 			system_endpoint,
 			replication_mode,
 			replication_factor,
@ -386,10 +327,9 @@ impl System {
 			consul_discovery,
 			#[cfg(feature = "kubernetes-discovery")]
 			kubernetes_discovery: config.kubernetes_discovery.clone(),
+			layout_manager,
 			metrics,

-			ring,
-			update_ring: Mutex::new(update_ring),
 			metadata_dir: config.metadata_dir.clone(),
 			data_dir: config.data_dir.clone(),
 		});
@ -409,6 +349,20 @@ impl System {
 		);
 	}

+	// ---- Public utilities / accessors ----
+
+	pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> {
+		self.layout_manager.layout() // read guard obtained from the layout manager; the layout stays read-locked for as long as the guard lives
+	}
+
+	pub fn layout_notify(&self) -> Arc<Notify> {
+		self.layout_manager.change_notify.clone() // shared handle to the layout manager's `change_notify`; presumably signalled on layout changes (confirm in LayoutManager)
+	}
+
+	pub fn rpc_helper(&self) -> &RpcHelper {
+		&self.layout_manager.rpc_helper // the RpcHelper is stored in the layout manager; System only lends out a reference
+	}
+
 	// ---- Administrative operations (directly available and
 	//      also available through RPC) ----

@ -435,18 +389,6 @@ impl System {
 		known_nodes
 	}

-	pub fn get_cluster_layout(&self) -> ClusterLayout {
-		self.ring.borrow().layout.clone()
-	}
-
-	pub async fn update_cluster_layout(
-		self: &Arc<Self>,
-		layout: &ClusterLayout,
-	) -> Result<(), Error> {
-		self.handle_advertise_cluster_layout(layout).await?;
-		Ok(())
-	}
-
 	pub async fn connect(&self, node: &str) -> Result<(), Error> {
 		let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node)
 			.await
@ -476,47 +418,63 @@ impl System {
 	}

 	pub fn health(&self) -> ClusterHealth {
-		let ring: Arc<_> = self.ring.borrow().clone();
 		let quorum = self.replication_mode.write_quorum();
-		let replication_factor = self.replication_factor;

+		// Gather information about running nodes.
+		// Technically, `nodes` contains currently running nodes, as well
+		// as nodes that this Garage process has been connected to at least
+		// once since it started.
 		let nodes = self
 			.get_known_nodes()
 			.into_iter()
 			.map(|n| (n.id, n))
 			.collect::<HashMap<Uuid, _>>();
 		let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count();
+		let node_up = |x: &Uuid| nodes.get(x).map(|n| n.is_up).unwrap_or(false);

-		let storage_nodes = ring
-			.layout
-			.roles
+		// Acquire a rwlock read-lock to the current cluster layout
+		let layout = self.cluster_layout();
+
+		// Obtain information about nodes that have a role as storage nodes
+		// in one of the active layout versions
+		let mut storage_nodes = HashSet::<Uuid>::with_capacity(16);
+		for ver in layout.versions.iter() {
+			storage_nodes.extend(
+				ver.roles
 					.items()
 					.iter()
 					.filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some()))
-			.collect::<Vec<_>>();
-		let storage_nodes_ok = storage_nodes
-			.iter()
-			.filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
-			.count();
+					.map(|(n, _, _)| *n),
+			)
+		}
+		let storage_nodes_ok = storage_nodes.iter().filter(|x| node_up(x)).count();

-		let partitions = ring.partitions();
-		let partitions_n_up = partitions
+		// Determine the number of partitions that have:
+		// - a quorum of up nodes for all write sets (i.e. are available)
+		// - for which all nodes in all write sets are up (i.e. are fully healthy)
+		let partitions = layout.current().partitions().collect::<Vec<_>>();
+		let mut partitions_quorum = 0;
+		let mut partitions_all_ok = 0;
+		for (_, hash) in partitions.iter() {
+			let mut write_sets = layout
+				.versions
 				.iter()
-			.map(|(_, h)| {
-				let pn = ring.get_nodes(h, ring.replication_factor);
-				pn.iter()
-					.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false))
-					.count()
-			})
-			.collect::<Vec<usize>>();
-		let partitions_all_ok = partitions_n_up
-			.iter()
-			.filter(|c| **c == replication_factor)
-			.count();
-		let partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count();
+				.map(|x| x.nodes_of(hash, x.replication_factor));
+			let has_quorum = write_sets
+				.clone()
+				.all(|set| set.filter(|x| node_up(x)).count() >= quorum);
+			let all_ok = write_sets.all(|mut set| set.all(|x| node_up(&x)));
+			if has_quorum {
+				partitions_quorum += 1;
+			}
+			if all_ok {
+				partitions_all_ok += 1;
+			}
+		}

+		// Determine overall cluster status
 		let status =
-			if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() {
+			if partitions_all_ok == partitions.len() && storage_nodes_ok == storage_nodes.len() {
 				ClusterHealthStatus::Healthy
 			} else if partitions_quorum == partitions.len() {
 				ClusterHealthStatus::Degraded
@ -556,7 +514,7 @@ impl System {
 		if let Err(e) = c
 			.publish_consul_service(
 				self.netapp.id,
-				&self.local_status.load_full().hostname,
+				&self.local_status.load_full().hostname.as_deref().unwrap(),
 				rpc_public_addr,
 			)
 			.await
@ -583,7 +541,7 @@ impl System {
 		if let Err(e) = publish_kubernetes_node(
 			k,
 			self.netapp.id,
-			&self.local_status.load_full().hostname,
+			&self.local_status.load_full().hostname.as_deref().unwrap(),
 			rpc_public_addr,
 		)
 		.await
@ -592,22 +550,10 @@ impl System {
 		}
 	}

-	/// Save network configuration to disc
-	async fn save_cluster_layout(&self) -> Result<(), Error> {
-		let ring: Arc<Ring> = self.ring.borrow().clone();
-		self.persist_cluster_layout
-			.save_async(&ring.layout)
-			.await
-			.expect("Cannot save current cluster layout");
-		Ok(())
-	}
-
 	fn update_local_status(&self) {
 		let mut new_si: NodeStatus = self.local_status.load().as_ref().clone();

-		let ring = self.ring.borrow();
-		new_si.cluster_layout_version = ring.layout.version;
-		new_si.cluster_layout_staging_hash = ring.layout.staging_hash;
+		new_si.layout_digest = self.layout_manager.layout().digest();

 		new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics);

@ -621,11 +567,6 @@ impl System {
 		Ok(SystemRpc::Ok)
 	}

-	fn handle_pull_cluster_layout(&self) -> SystemRpc {
-		let ring = self.ring.borrow().clone();
-		SystemRpc::AdvertiseClusterLayout(ring.layout.clone())
-	}
-
 	fn handle_get_known_nodes(&self) -> SystemRpc {
 		let known_nodes = self.get_known_nodes();
 		SystemRpc::ReturnKnownNodes(known_nodes)
@ -645,11 +586,8 @@ impl System {
 			std::process::exit(1);
 		}

-		if info.cluster_layout_version > local_info.cluster_layout_version
-			|| info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash
-		{
-			tokio::spawn(self.clone().pull_cluster_layout(from));
-		}
+		self.layout_manager
+			.handle_advertise_status(from, &info.layout_digest);

 		self.node_status
 			.write()
@ -659,57 +597,6 @@ impl System {
 		Ok(SystemRpc::Ok)
 	}

-	async fn handle_advertise_cluster_layout(
-		self: &Arc<Self>,
-		adv: &ClusterLayout,
-	) -> Result<SystemRpc, Error> {
-		if adv.replication_factor != self.replication_factor {
-			let msg = format!(
-				"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
-				adv.replication_factor,
-				self.replication_factor
-			);
-			error!("{}", msg);
-			return Err(Error::Message(msg));
-		}
-
-		let update_ring = self.update_ring.lock().await;
-		let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
-
-		let prev_layout_check = layout.check().is_ok();
-		if layout.merge(adv) {
-			if prev_layout_check && layout.check().is_err() {
-				error!("New cluster layout is invalid, discarding.");
-				return Err(Error::Message(
-					"New cluster layout is invalid, discarding.".into(),
-				));
-			}
-
-			let ring = Ring::new(layout.clone(), self.replication_factor);
-			update_ring.send(Arc::new(ring))?;
-			drop(update_ring);
-
-			let self2 = self.clone();
-			tokio::spawn(async move {
-				if let Err(e) = self2
-					.rpc
-					.broadcast(
-						&self2.system_endpoint,
-						SystemRpc::AdvertiseClusterLayout(layout),
-						RequestStrategy::with_priority(PRIO_HIGH),
-					)
-					.await
-				{
-					warn!("Error while broadcasting new cluster layout: {}", e);
-				}
-			});
-
-			self.save_cluster_layout().await?;
-		}
-
-		Ok(SystemRpc::Ok)
-	}
-
 	async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver<bool>) {
 		while !*stop_signal.borrow() {
 			let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL;
@ -717,7 +604,7 @@ impl System {
 			self.update_local_status();
 			let local_status: NodeStatus = self.local_status.load().as_ref().clone();
 			let _ = self
-				.rpc
+				.rpc_helper()
 				.broadcast(
 					&self.system_endpoint,
 					SystemRpc::AdvertiseStatus(local_status),
@ -735,9 +622,9 @@ impl System {

 	async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
 		while !*stop_signal.borrow() {
-			let not_configured = self.ring.borrow().layout.check().is_err();
+			let not_configured = self.cluster_layout().check().is_err();
 			let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor;
-			let expected_n_nodes = self.ring.borrow().layout.num_nodes();
+			let expected_n_nodes = self.cluster_layout().all_nodes().len();
 			let bad_peers = self
 				.fullmesh
 				.get_peer_list()
@ -842,48 +729,49 @@ impl System {
 			.save_async(&PeerList(peer_list))
 			.await
 	}
-
-	async fn pull_cluster_layout(self: Arc<Self>, peer: Uuid) {
-		let resp = self
-			.rpc
-			.call(
-				&self.system_endpoint,
-				peer,
-				SystemRpc::PullClusterLayout,
-				RequestStrategy::with_priority(PRIO_HIGH),
-			)
-			.await;
-		if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
-			let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await;
-		}
-	}
 }

 #[async_trait]
 impl EndpointHandler<SystemRpc> for System {
 	async fn handle(self: &Arc<Self>, msg: &SystemRpc, from: NodeID) -> Result<SystemRpc, Error> {
 		match msg {
+			// ---- system functions -> System ----
 			SystemRpc::Connect(node) => self.handle_connect(node).await,
-			SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()),
 			SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await,
-			SystemRpc::AdvertiseClusterLayout(adv) => {
-				self.clone().handle_advertise_cluster_layout(adv).await
-			}
 			SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()),
+
+			// ---- layout functions -> LayoutManager ----
+			SystemRpc::PullClusterLayout => Ok(self.layout_manager.handle_pull_cluster_layout()),
+			SystemRpc::AdvertiseClusterLayout(adv) => {
+				self.layout_manager
+					.handle_advertise_cluster_layout(adv)
+					.await
+			}
+			SystemRpc::PullClusterLayoutTrackers => {
+				Ok(self.layout_manager.handle_pull_cluster_layout_trackers())
+			}
+			SystemRpc::AdvertiseClusterLayoutTrackers(adv) => {
+				self.layout_manager
+					.handle_advertise_cluster_layout_trackers(adv)
+					.await
+			}
+
+			// ---- other -> Error ----
 			m => Err(Error::unexpected_rpc_message(m)),
 		}
 	}
 }

 impl NodeStatus {
-	fn initial(replication_factor: usize, layout: &ClusterLayout) -> Self {
+	fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self {
 		NodeStatus {
-			hostname: gethostname::gethostname()
+			hostname: Some(
+				gethostname::gethostname()
 					.into_string()
 					.unwrap_or_else(|_| "<invalid utf-8>".to_string()),
+			),
 			replication_factor,
-			cluster_layout_version: layout.version,
-			cluster_layout_staging_hash: layout.staging_hash,
+			layout_digest: layout_manager.layout().digest(),
 			meta_disk_avail: None,
 			data_disk_avail: None,
 		}
@ -891,27 +779,14 @@ impl NodeStatus {

 	fn unknown() -> Self {
 		NodeStatus {
-			hostname: "?".to_string(),
+			hostname: None,
 			replication_factor: 0,
-			cluster_layout_version: 0,
-			cluster_layout_staging_hash: Hash::from([0u8; 32]),
+			layout_digest: Default::default(),
 			meta_disk_avail: None,
 			data_disk_avail: None,
 		}
 	}

-	#[cfg(windows)]
-	fn update_disk_usage(
-		&mut self,
-		_meta_dir: &Path,
-		_data_dir: &DataDirEnum,
-		_metrics: &SystemMetrics,
-	) {
-		// TODO(mediocregopher) it'd be nice to have this for windows too, but it seems to only be
-		// used for OpenTelemetry so it's not a real requirement.
-	}
-
-	#[cfg(not(windows))]
 	fn update_disk_usage(
 		&mut self,
 		meta_dir: &Path,
@ -919,7 +794,6 @@ impl NodeStatus {
 		metrics: &SystemMetrics,
 	) {
 		use nix::sys::statvfs::statvfs;
-		use std::sync::atomic::Ordering;
 		let mount_avail = |path: &Path| match statvfs(path) {
 			Ok(x) => {
 				let avail = x.blocks_available() as u64 * x.fragment_size() as u64;
@ -978,7 +852,6 @@ impl NodeStatus {
 	}
 }

-#[cfg(not(windows))]
 fn get_default_ip() -> Option<IpAddr> {
 	pnet_datalink::interfaces()
 		.iter()
@ -987,9 +860,38 @@ fn get_default_ip() -> Option<IpAddr> {
 		.map(|a| a.ip())
 }

-#[cfg(windows)]
-fn get_default_ip() -> Option<IpAddr> {
+fn get_rpc_public_addr(config: &Config) -> Option<SocketAddr> { // public RPC address of this node, or None if it cannot be determined
+	match &config.rpc_public_addr {
+		Some(a_str) => { // explicitly configured: resolve the configured string
+			use std::net::ToSocketAddrs;
+			match a_str.to_socket_addrs() {
+				Err(e) => { // resolution failed: log the error and give up
+					error!(
+						"Cannot resolve rpc_public_addr {} from config file: {}.",
+						a_str, e
+					);
 					None
+				}
+				Ok(a) => {
+					let a = a.collect::<Vec<_>>();
+					if a.is_empty() {
+						error!("rpc_public_addr {} resolve to no known IP address", a_str);
+					}
+					if a.len() > 1 {
+						warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a);
+					}
+					a.into_iter().next() // first resolved address, or None when the list is empty
+				}
+			}
+		}
+		None => { // not configured: fall back to get_default_ip() paired with the port of rpc_bind_addr
+			let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
+			if let Some(a) = addr {
+				warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a);
+			}
+			addr
+		}
+	}
 }

 async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> {
--- a/src/table/data.rs
+++ b/src/table/data.rs
@ -254,7 +254,8 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
 				// of the GC algorithm, as in all cases GC is suspended if
 				// any node of the partition is unavailable.
 				let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
-				let nodes = self.replication.write_nodes(&pk_hash);
+				// TODO: this probably breaks when the layout changes
+				let nodes = self.replication.storage_nodes(&pk_hash);
 				if nodes.first() == Some(&self.system.id) {
 					GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
 				}
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@ -152,7 +152,7 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
 		let mut partitions = HashMap::new();
 		for entry in entries {
 			let pkh = Hash::try_from(&entry.key[..32]).unwrap();
-			let mut nodes = self.data.replication.write_nodes(&pkh);
+			let mut nodes = self.data.replication.storage_nodes(&pkh);
 			nodes.retain(|x| *x != self.system.id);
 			nodes.sort();

@ -227,10 +227,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
 		// GC'ing is not a critical function of the system, so it's not a big
 		// deal if we can't do it right now.
 		self.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
-				&nodes[..],
+				&nodes,
 				GcRpc::Update(updates),
 				RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
 			)
@ -248,10 +248,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
 		// it means that the garbage collection wasn't completed and has
 		// to be retried later.
 		self.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
-				&nodes[..],
+				&nodes,
 				GcRpc::DeleteIfEqualHash(deletes),
 				RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
 			)
--- a/src/table/merkle.rs
+++ b/src/table/merkle.rs
@ -13,7 +13,7 @@ use garage_util::data::*;
 use garage_util::encode::{nonversioned_decode, nonversioned_encode};
 use garage_util::error::Error;

-use garage_rpc::ring::*;
+use garage_rpc::layout::*;

 use crate::data::*;
 use crate::replication::*;
--- a/src/table/replication/fullcopy.rs
+++ b/src/table/replication/fullcopy.rs
@ -1,15 +1,22 @@
 use std::sync::Arc;

-use garage_rpc::ring::*;
+use garage_rpc::layout::*;
 use garage_rpc::system::System;
 use garage_util::data::*;

 use crate::replication::*;

+// TODO: find a way to track layout changes for this as well
+// The hard thing is that this data is stored also on gateway nodes,
+// whereas sharded data is stored only on non-Gateway nodes (storage nodes)
+// Also we want to be more tolerant to failures of gateways so we don't
+// want to do too much holding back of data when progress of gateway
+// nodes is not reported in the layout history's ack/sync/sync_ack maps.
+
 /// Full replication schema: all nodes store everything
-/// Writes are disseminated in an epidemic manner in the network
 /// Advantage: do all reads locally, extremely fast
 /// Inconvenient: only suitable to reasonably small tables
+/// Inconvenient: if some writes fail, nodes will read outdated data
 #[derive(Clone)]
 pub struct TableFullReplication {
 	/// The membership manager of this node
@ -19,6 +26,13 @@ pub struct TableFullReplication {
 }

 impl TableReplication for TableFullReplication {
+	type WriteSets = Vec<Vec<Uuid>>;
+
+	fn storage_nodes(&self, _hash: &Hash) -> Vec<Uuid> { // the hash is ignored: the answer is the same for every key
+		let layout = self.system.cluster_layout();
+		layout.current().all_nodes().to_vec() // copy of the node list of the layout's current version
+	}
+
 	fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
 		vec![self.system.id]
 	}
@ -26,12 +40,11 @@ impl TableReplication for TableFullReplication {
 		1
 	}

-	fn write_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
-		let ring = self.system.ring.borrow();
-		ring.layout.node_ids().to_vec()
+	fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
+		vec![self.storage_nodes(hash)] // a single write set made of all storage nodes
 	}
 	fn write_quorum(&self) -> usize {
-		let nmembers = self.system.ring.borrow().layout.node_ids().len();
+		let nmembers = self.system.cluster_layout().current().all_nodes().len();
 		if nmembers > self.max_faults {
 			nmembers - self.max_faults
 		} else {
@ -45,7 +58,18 @@ impl TableReplication for TableFullReplication {
 	fn partition_of(&self, _hash: &Hash) -> Partition {
 		0u16
 	}
-	fn partitions(&self) -> Vec<(Partition, Hash)> {
-		vec![(0u16, [0u8; 32].into())]
+
+	fn sync_partitions(&self) -> SyncPartitions { // one single partition (number 0) covering the whole hash range
+		let layout = self.system.cluster_layout();
+		let layout_version = layout.current().version;
+		SyncPartitions {
+			layout_version,
+			partitions: vec![SyncPartition {
+				partition: 0u16,
+				first_hash: [0u8; 32].into(), // all-zero hash: start of the range
+				last_hash: [0xff; 32].into(), // all-0xff hash: end of the range
+				storage_sets: vec![layout.current().all_nodes().to_vec()], // one set: every node of the current layout version
+			}],
+		}
 	}
 }
--- a/src/table/replication/parameters.rs
+++ b/src/table/replication/parameters.rs
@ -1,25 +1,44 @@
-use garage_rpc::ring::*;
+use garage_rpc::layout::*;
 use garage_util::data::*;

 /// Trait to describe how a table shall be replicated
 pub trait TableReplication: Send + Sync + 'static {
+	type WriteSets: AsRef<Vec<Vec<Uuid>>> + AsMut<Vec<Vec<Uuid>>> + Send + Sync + 'static; // value returned by write_sets(); readable and mutable as a list of node sets
+
 	// See examples in table_sharded.rs and table_fullcopy.rs
 	// To understand various replication methods

+	/// The entire list of all nodes that store the partition of the given hash
+	fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid>;
+
 	/// Which nodes to send read requests to
 	fn read_nodes(&self, hash: &Hash) -> Vec<Uuid>;
 	/// Responses needed to consider a read succesfull
 	fn read_quorum(&self) -> usize;

 	/// Which nodes to send writes to
-	fn write_nodes(&self, hash: &Hash) -> Vec<Uuid>;
-	/// Responses needed to consider a write succesfull
+	fn write_sets(&self, hash: &Hash) -> Self::WriteSets;
+	/// Responses needed to consider a write successful in each set
 	fn write_quorum(&self) -> usize;
 	fn max_write_errors(&self) -> usize;

 	// Accessing partitions, for Merkle tree & sync
 	/// Get partition for data with given hash
 	fn partition_of(&self, hash: &Hash) -> Partition;
-	/// List of existing partitions
-	fn partitions(&self) -> Vec<(Partition, Hash)>;
+	/// List of partitions, and of the nodes to sync each of them with, in the current layout
+	fn sync_partitions(&self) -> SyncPartitions;
+}
+
+#[derive(Debug)]
+pub struct SyncPartitions { // result of TableReplication::sync_partitions()
+	pub layout_version: u64, // layout version reported by the replication scheme (current version for full copy, ack_map_min() for sharded)
+	pub partitions: Vec<SyncPartition>, // one entry per partition to sync
+}
+
+#[derive(Debug)]
+pub struct SyncPartition { // one partition to sync, with its hash range and the node sets that store it
+	pub partition: Partition, // partition number
+	pub first_hash: Hash, // start of the partition's hash range
+	pub last_hash: Hash, // end of the range: first hash of the next partition, or the all-0xFF hash for the last one
+	pub storage_sets: Vec<Vec<Uuid>>, // sets of nodes storing this partition; the syncer keeps the partition if the local node is in any of them
+}
--- a/src/table/replication/sharded.rs
+++ b/src/table/replication/sharded.rs
@ -1,6 +1,6 @@
 use std::sync::Arc;

-use garage_rpc::ring::*;
+use garage_rpc::layout::*;
 use garage_rpc::system::System;
 use garage_util::data::*;

@ -25,17 +25,21 @@ pub struct TableShardedReplication {
 }

 impl TableReplication for TableShardedReplication {
+	type WriteSets = WriteLock<Vec<Vec<Uuid>>>;
+
+	fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid> {
+		self.system.cluster_layout().storage_nodes_of(hash) // delegated to the layout helper, under a short-lived read guard
+	}
+
 	fn read_nodes(&self, hash: &Hash) -> Vec<Uuid> {
-		let ring = self.system.ring.borrow();
-		ring.get_nodes(hash, self.replication_factor)
+		self.system.cluster_layout().read_nodes_of(hash)
 	}
 	fn read_quorum(&self) -> usize {
 		self.read_quorum
 	}

-	fn write_nodes(&self, hash: &Hash) -> Vec<Uuid> {
-		let ring = self.system.ring.borrow();
-		ring.get_nodes(hash, self.replication_factor)
+	fn write_sets(&self, hash: &Hash) -> Self::WriteSets {
+		self.system.layout_manager.write_sets_of(hash) // asks the layout manager itself (not the read-locked layout helper); the sets come back wrapped in a WriteLock
 	}
 	fn write_quorum(&self) -> usize {
 		self.write_quorum
@ -45,9 +49,38 @@ impl TableReplication for TableShardedReplication {
 	}

 	fn partition_of(&self, hash: &Hash) -> Partition {
-		self.system.ring.borrow().partition_of(hash)
+		self.system.cluster_layout().current().partition_of(hash)
+	}
+
+	fn sync_partitions(&self) -> SyncPartitions { // partitions of the current layout version, each with its hash range and storage sets
+		let layout = self.system.cluster_layout();
+		let layout_version = layout.ack_map_min(); // NOTE(review): reports ack_map_min(), not current().version; presumably the lowest acked version, confirm in LayoutHelper
+
+		let mut partitions = layout
+			.current()
+			.partitions()
+			.map(|(partition, first_hash)| {
+				let storage_sets = layout.storage_sets_of(&first_hash);
+				SyncPartition {
+					partition,
+					first_hash,
+					last_hash: [0u8; 32].into(), // placeholder, overwritten by the loop just below
+					storage_sets,
+				}
+			})
+			.collect::<Vec<_>>();
+
+		for i in 0..partitions.len() { // second pass: last_hash needs the first_hash of the following partition
+			partitions[i].last_hash = if i + 1 < partitions.len() {
+				partitions[i + 1].first_hash
+			} else {
+				[0xFFu8; 32].into() // the last partition ends at the all-0xFF hash
+			};
+		}
+
+		SyncPartitions {
+			layout_version,
+			partitions,
 		}
-	fn partitions(&self) -> Vec<(Partition, Hash)> {
-		self.system.ring.borrow().partitions()
 	}
 }
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@ -6,18 +6,19 @@ use arc_swap::ArcSwapOption;
 use async_trait::async_trait;
 use futures_util::stream::*;
 use opentelemetry::KeyValue;
-use rand::Rng;
+use rand::prelude::*;
 use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;
 use tokio::select;
-use tokio::sync::{mpsc, watch};
+use tokio::sync::{mpsc, watch, Notify};

 use garage_util::background::*;
 use garage_util::data::*;
 use garage_util::encode::{debug_serialize, nonversioned_encode};
 use garage_util::error::{Error, OkOrMessage};

-use garage_rpc::ring::*;
+use garage_rpc::layout::*;
+use garage_rpc::rpc_helper::QuorumSetResultTracker;
 use garage_rpc::system::System;
 use garage_rpc::*;

@ -52,16 +53,6 @@ impl Rpc for SyncRpc {
 	type Response = Result<SyncRpc, Error>;
 }

-#[derive(Debug, Clone)]
-struct TodoPartition {
-	partition: Partition,
-	begin: Hash,
-	end: Hash,
-
-	// Are we a node that stores this partition or not?
-	retain: bool,
-}
-
 impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 	pub(crate) fn new(
 		system: Arc<System>,
@ -91,10 +82,10 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {

 		bg.spawn_worker(SyncWorker {
 			syncer: self.clone(),
-			ring_recv: self.system.ring.clone(),
-			ring: self.system.ring.borrow().clone(),
+			layout_notify: self.system.layout_notify(),
+			layout_digest: self.system.cluster_layout().sync_digest(),
 			add_full_sync_rx,
-			todo: vec![],
+			todo: None,
 			next_full_sync: Instant::now() + Duration::from_secs(20),
 		});
 	}
@ -112,54 +103,57 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {

 	async fn sync_partition(
 		self: &Arc<Self>,
-		partition: &TodoPartition,
+		partition: &SyncPartition,
 		must_exit: &mut watch::Receiver<bool>,
 	) -> Result<(), Error> {
-		if partition.retain {
 		let my_id = self.system.id;
+		let retain = partition.storage_sets.iter().any(|x| x.contains(&my_id)); // we keep this partition if we belong to at least one of its storage sets

-			let nodes = self
-				.data
-				.replication
-				.write_nodes(&partition.begin)
-				.into_iter()
-				.filter(|node| *node != my_id)
-				.collect::<Vec<_>>();
-
+		if retain {
 			debug!(
 				"({}) Syncing {:?} with {:?}...",
 				F::TABLE_NAME,
 				partition,
-				nodes
+				partition.storage_sets
 			);
-			let mut sync_futures = nodes
-				.iter()
+			let mut result_tracker = QuorumSetResultTracker::new( // collects per-node results; NOTE(review): presumably checks the write quorum in each storage set, confirm in rpc_helper
+				&partition.storage_sets,
+				self.data.replication.write_quorum(),
+			);
+
+			let mut sync_futures = result_tracker
+				.nodes
+				.keys()
+				.copied()
 				.map(|node| {
-					self.clone()
-						.do_sync_with(partition.clone(), *node, must_exit.clone())
+					let must_exit = must_exit.clone();
+					async move {
+						if node == my_id { // no sync with ourselves: the local node counts as an immediate success
+							(node, Ok(()))
+						} else {
+							(node, self.do_sync_with(partition, node, must_exit).await)
+						}
+					}
 				})
 				.collect::<FuturesUnordered<_>>();

-			let mut n_errors = 0;
-			while let Some(r) = sync_futures.next().await {
-				if let Err(e) = r {
-					n_errors += 1;
-					warn!("({}) Sync error: {}", F::TABLE_NAME, e);
+			while let Some((node, res)) = sync_futures.next().await {
+				if let Err(e) = &res {
+					warn!("({}) Sync error with {:?}: {}", F::TABLE_NAME, node, e);
 				}
-			}
-			if n_errors > self.data.replication.max_write_errors() {
-				return Err(Error::Message(format!(
-					"Sync failed with too many nodes (should have been: {:?}).",
-					nodes
-				)));
-			}
-		} else {
-			self.offload_partition(&partition.begin, &partition.end, must_exit)
-				.await?;
+				result_tracker.register_result(node, res); // record the outcome for this node, success or error
 			}

+			if result_tracker.too_many_failures() {
+				Err(result_tracker.quorum_error())
+			} else {
 				Ok(())
 			}
+		} else {
+			self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) // we are in none of the storage sets: offload the partition's hash range
+				.await
+		}
+	}

 	// Offload partition: this partition is not something we are storing,
 	// so send it out to all other nodes that store it and delete items locally.
@ -188,12 +182,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 			}

 			if !items.is_empty() {
-				let nodes = self
-					.data
-					.replication
-					.write_nodes(begin)
-					.into_iter()
-					.collect::<Vec<_>>();
+				let nodes = self.data.replication.storage_nodes(begin);
 				if nodes.contains(&self.system.id) {
 					warn!(
 						"({}) Interrupting offload as partitions seem to have changed",
@ -217,7 +206,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 					end,
 					counter
 				);
-				self.offload_items(&items, &nodes[..]).await?;
+				self.offload_items(&items, &nodes).await?;
 			} else {
 				break;
 			}
@ -244,7 +233,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 		}

 		self.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
 				nodes,
@ -284,8 +273,8 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 	}

 	async fn do_sync_with(
-		self: Arc<Self>,
-		partition: TodoPartition,
+		self: &Arc<Self>,
+		partition: &SyncPartition,
 		who: Uuid,
 		must_exit: watch::Receiver<bool>,
 	) -> Result<(), Error> {
@ -305,7 +294,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 		// If so, do nothing.
 		let root_resp = self
 			.system
-			.rpc
+			.rpc_helper()
 			.call(
 				&self.endpoint,
 				who,
@ -361,7 +350,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {
 					// and compare it with local node
 					let remote_node = match self
 						.system
-						.rpc
+						.rpc_helper()
 						.call(
 							&self.endpoint,
 							who,
@ -437,7 +426,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> {

 		let rpc_resp = self
 			.system
-			.rpc
+			.rpc_helper()
 			.call(
 				&self.endpoint,
 				who,
@ -492,76 +481,42 @@ impl<F: TableSchema, R: TableReplication> EndpointHandler<SyncRpc> for TableSync

 struct SyncWorker<F: TableSchema, R: TableReplication> {
 	syncer: Arc<TableSyncer<F, R>>,
-	ring_recv: watch::Receiver<Arc<Ring>>,
-	ring: Arc<Ring>,
+
+	layout_notify: Arc<Notify>,
+	layout_digest: SyncLayoutDigest,
+
 	add_full_sync_rx: mpsc::UnboundedReceiver<()>,
-	todo: Vec<TodoPartition>,
 	next_full_sync: Instant,
+
+	todo: Option<SyncPartitions>,
 }

 impl<F: TableSchema, R: TableReplication> SyncWorker<F, R> {
+	fn check_add_full_sync(&mut self) {
+		let layout_digest = self.syncer.system.cluster_layout().sync_digest();
+		if layout_digest != self.layout_digest {
+			self.layout_digest = layout_digest;
+			info!(
+				"({}) Layout versions changed ({:?}), adding full sync to syncer todo list",
+				F::TABLE_NAME,
+				layout_digest,
+			);
+			self.add_full_sync();
+		}
+	}
+
 	fn add_full_sync(&mut self) {
-		let system = &self.syncer.system;
-		let data = &self.syncer.data;
-
-		let my_id = system.id;
-
-		self.todo.clear();
-
-		let partitions = data.replication.partitions();
-
-		for i in 0..partitions.len() {
-			let begin = partitions[i].1;
-
-			let end = if i + 1 < partitions.len() {
-				partitions[i + 1].1
-			} else {
-				[0xFFu8; 32].into()
-			};
-
-			let nodes = data.replication.write_nodes(&begin);
-
-			let retain = nodes.contains(&my_id);
-			if !retain {
-				// Check if we have some data to send, otherwise skip
-				match data.store.range(begin..end) {
-					Ok(mut iter) => {
-						if iter.next().is_none() {
-							continue;
-						}
-					}
-					Err(e) => {
-						warn!("DB error in add_full_sync: {}", e);
-						continue;
-					}
-				}
-			}
-
-			self.todo.push(TodoPartition {
-				partition: partitions[i].0,
-				begin,
-				end,
-				retain,
-			});
-		}
+		let mut partitions = self.syncer.data.replication.sync_partitions();
+		info!(
+			"{}: Adding full sync for ack layout version {}",
+			F::TABLE_NAME,
+			partitions.layout_version
+		);

+		partitions.partitions.shuffle(&mut thread_rng());
+		self.todo = Some(partitions);
 		self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
 	}
-
-	fn pop_task(&mut self) -> Option<TodoPartition> {
-		if self.todo.is_empty() {
-			return None;
-		}
-
-		let i = rand::thread_rng().gen_range(0..self.todo.len());
-		if i == self.todo.len() - 1 {
-			self.todo.pop()
-		} else {
-			let replacement = self.todo.pop().unwrap();
-			let ret = std::mem::replace(&mut self.todo[i], replacement);
-			Some(ret)
-		}
-	}
 }

 #[async_trait]
@ -572,14 +527,48 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {

 	fn status(&self) -> WorkerStatus {
 		WorkerStatus {
-			queue_length: Some(self.todo.len() as u64),
+			queue_length: Some(self.todo.as_ref().map(|x| x.partitions.len()).unwrap_or(0) as u64),
 			..Default::default()
 		}
 	}

 	async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
-		if let Some(partition) = self.pop_task() {
-			self.syncer.sync_partition(&partition, must_exit).await?;
+		self.check_add_full_sync();
+
+		if let Some(todo) = &mut self.todo {
+			let partition = todo.partitions.pop().unwrap();
+
+			// process partition
+			if let Err(e) = self.syncer.sync_partition(&partition, must_exit).await {
+				error!(
+					"{}: Failed to sync partition {:?}: {}",
+					F::TABLE_NAME,
+					partition,
+					e
+				);
+				// if error, put partition back at the other side of the queue,
+				// so that other partitions will be tried in the meantime
+				todo.partitions.insert(0, partition);
+				// TODO: returning an error here will cause the background job worker
+				// to delay this task for some time, but maybe we don't want to
+				// delay it if there are lots of failures from nodes that are gone
+				// (we also don't want zero delays as that will cause lots of useless retries)
+				return Err(e);
+			}
+
+			if todo.partitions.is_empty() {
+				info!(
+					"{}: Completed full sync for ack layout version {}",
+					F::TABLE_NAME,
+					todo.layout_version
+				);
+				self.syncer
+					.system
+					.layout_manager
+					.sync_table_until(F::TABLE_NAME, todo.layout_version);
+				self.todo = None;
+			}
+
 			Ok(WorkerState::Busy)
 		} else {
 			Ok(WorkerState::Idle)
@ -593,22 +582,16 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
 					self.add_full_sync();
 				}
 			},
-			_ = self.ring_recv.changed() => {
-				let new_ring = self.ring_recv.borrow();
-				if !Arc::ptr_eq(&new_ring, &self.ring) {
-					self.ring = new_ring.clone();
-					drop(new_ring);
-					debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME);
-					self.add_full_sync();
-				}
+			_ = self.layout_notify.notified() => {
+				self.check_add_full_sync();
 			},
 			_ = tokio::time::sleep_until(self.next_full_sync.into()) => {
 				self.add_full_sync();
 			}
 		}
-		match self.todo.is_empty() {
-			false => WorkerState::Busy,
-			true => WorkerState::Idle,
+		match self.todo.is_some() {
+			true => WorkerState::Busy,
+			false => WorkerState::Idle,
 		}
 	}
 }
--- a/src/table/table.rs
+++ b/src/table/table.rs
@ -20,6 +20,7 @@ use garage_util::error::Error;
 use garage_util::metrics::RecordDuration;
 use garage_util::migrate::Migrate;

+use garage_rpc::rpc_helper::QuorumSetResultTracker;
 use garage_rpc::system::System;
 use garage_rpc::*;

@ -80,6 +81,8 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {
 		let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone());
 		let gc = TableGc::new(system.clone(), data.clone());

+		system.layout_manager.add_table(F::TABLE_NAME);
+
 		let table = Arc::new(Self {
 			system,
 			data,
@ -117,16 +120,16 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {

 	async fn insert_internal(&self, e: &F::E) -> Result<(), Error> {
 		let hash = e.partition_key().hash();
-		let who = self.data.replication.write_nodes(&hash);
+		let who = self.data.replication.write_sets(&hash);

 		let e_enc = Arc::new(ByteBuf::from(e.encode()?));
 		let rpc = TableRpc::<F>::Update(vec![e_enc]);

 		self.system
-			.rpc
-			.try_call_many(
+			.rpc_helper()
+			.try_write_many_sets(
 				&self.endpoint,
-				&who[..],
+				who.as_ref(),
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
 					.with_quorum(self.data.replication.write_quorum()),
@ -141,7 +144,7 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {
 		self.data.queue_insert(tx, e)
 	}

-	pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
+	pub async fn insert_many<I, IE>(self: &Arc<Self>, entries: I) -> Result<(), Error>
 	where
 		I: IntoIterator<Item = IE> + Send + Sync,
 		IE: Borrow<F::E> + Send + Sync,
@ -159,51 +162,123 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {
 		Ok(())
 	}

-	async fn insert_many_internal<I, IE>(&self, entries: I) -> Result<(), Error>
+	async fn insert_many_internal<I, IE>(self: &Arc<Self>, entries: I) -> Result<(), Error>
 	where
 		I: IntoIterator<Item = IE> + Send + Sync,
 		IE: Borrow<F::E> + Send + Sync,
 	{
-		let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
+		// The different items will have to be stored on possibly different nodes.
+		// We will here batch all items into a single request for each concerned
+		// node, with all of the entries it must store within that request.
+		// Each entry has to be saved to a specific list of "write sets", i.e. a set
+		// of nodes within which a quorum must be achieved. In normal operation, there
+		// is a single write set which corresponds to the quorum in the current
+		// cluster layout, but when the layout is updated, multiple write sets might
+		// have to be handled at once. Here, since we are sending many entries, we
+		// will have to handle many write sets in all cases. The algorithm is thus
+		// to send one request to each node with all the items it must save,
+		// and keep track of the OK responses within each write set: if for all sets
+		// a quorum of nodes has answered OK, then the insert has succeeded and
+		// consistency properties (read-after-write) are preserved.

+		let quorum = self.data.replication.write_quorum();
+
+		// Serialize all entries and compute the write sets for each of them.
+		// In the case of sharded table replication, this also takes an "ack lock"
+		// to the layout manager to avoid ack'ing newer versions which are not
+		// taken into account by writes in progress (the ack can happen later, once
+		// all writes that didn't take the new layout into account are finished).
+		// These locks are released when entries_vec is dropped, i.e. when this
+		// function returns.
+		let mut entries_vec = Vec::new();
 		for entry in entries.into_iter() {
 			let entry = entry.borrow();
 			let hash = entry.partition_key().hash();
-			let who = self.data.replication.write_nodes(&hash);
+			let mut write_sets = self.data.replication.write_sets(&hash);
+			for set in write_sets.as_mut().iter_mut() {
+				// Sort nodes in each write set to merge write sets with the same
+				// nodes but in possibly different orders
+				set.sort();
+			}
 			let e_enc = Arc::new(ByteBuf::from(entry.encode()?));
-			for node in who {
-				call_list.entry(node).or_default().push(e_enc.clone());
+			entries_vec.push((write_sets, e_enc));
+		}
+
+		// Compute a deduplicated list of all of the write sets,
+		// and compute an index from each node to the position of the sets in which
+		// it takes part, to optimize the detection of a quorum.
+		let mut write_sets = entries_vec
+			.iter()
+			.flat_map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice()))
+			.collect::<Vec<&[Uuid]>>();
+		write_sets.sort();
+		write_sets.dedup();
+
+		let mut result_tracker = QuorumSetResultTracker::new(&write_sets, quorum);
+
+		// Build a map of all nodes to the entries that must be sent to that node.
+		let mut call_list: HashMap<Uuid, Vec<_>> = HashMap::new();
+		for (write_sets, entry_enc) in entries_vec.iter() {
+			for write_set in write_sets.as_ref().iter() {
+				for node in write_set.iter() {
+					let node_entries = call_list.entry(*node).or_default();
+					match node_entries.last() {
+						Some(x) if Arc::ptr_eq(x, entry_enc) => {
+							// skip if entry already in list to send to this node
+							// (could happen if node is in several write sets for this entry)
+						}
+						_ => {
+							node_entries.push(entry_enc.clone());
+						}
+					}
+				}
 			}
 		}

-		let call_futures = call_list.drain().map(|(node, entries)| async move {
+		// Build futures to actually perform each of the corresponding RPC calls
+		let call_futures = call_list.into_iter().map(|(node, entries)| {
+			let this = self.clone();
+			async move {
 				let rpc = TableRpc::<F>::Update(entries);
-
-			let resp = self
+				let resp = this
 					.system
-				.rpc
+					.rpc_helper()
 					.call(
-					&self.endpoint,
+						&this.endpoint,
 						node,
 						rpc,
-					RequestStrategy::with_priority(PRIO_NORMAL),
+						RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(quorum),
 					)
-				.await?;
-			Ok::<_, Error>((node, resp))
+					.await;
+				(node, resp)
+			}
 		});
-		let mut resps = call_futures.collect::<FuturesUnordered<_>>();
-		let mut errors = vec![];

-		while let Some(resp) = resps.next().await {
-			if let Err(e) = resp {
-				errors.push(e);
+		// Run all requests in parallel thanks to FuturesUnordered, and collect results.
+		let mut resps = call_futures.collect::<FuturesUnordered<_>>();
+
+		while let Some((node, resp)) = resps.next().await {
+			result_tracker.register_result(node, resp.map(|_| ()));
+
+			if result_tracker.all_quorums_ok() {
+				// Success
+
+				// Continue all other requests in background
+				tokio::spawn(async move {
+					resps.collect::<Vec<(Uuid, Result<_, _>)>>().await;
+				});
+
+				return Ok(());
+			}
+
+			if result_tracker.too_many_failures() {
+				// Too many errors in this set, we know we won't get a quorum
+				break;
 			}
 		}
-		if errors.len() > self.data.replication.max_write_errors() {
-			Err(Error::Message("Too many errors".into()))
-		} else {
-			Ok(())
-		}
+
+		// Failure, could not get quorum within at least one set
+		Err(result_tracker.quorum_error())
 	}

 	pub async fn get(
@ -236,14 +311,13 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {
 		let rpc = TableRpc::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
 		let resps = self
 			.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
-				&who[..],
+				&who,
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(self.data.replication.read_quorum())
-					.interrupt_after_quorum(true),
+					.with_quorum(self.data.replication.read_quorum()),
 			)
 			.await?;

@ -332,14 +406,13 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {

 		let resps = self
 			.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
-				&who[..],
+				&who,
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(self.data.replication.read_quorum())
-					.interrupt_after_quorum(true),
+					.with_quorum(self.data.replication.read_quorum()),
 			)
 			.await?;

@ -411,7 +484,7 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> {
 	async fn repair_on_read(&self, who: &[Uuid], what: F::E) -> Result<(), Error> {
 		let what_enc = Arc::new(ByteBuf::from(what.encode()?));
 		self.system
-			.rpc
+			.rpc_helper()
 			.try_call_many(
 				&self.endpoint,
 				who,
--- a/src/util/error.rs
+++ b/src/util/error.rs
@ -55,13 +55,14 @@ pub enum Error {
 	Timeout,

 	#[error(
-		display = "Could not reach quorum of {}. {} of {} request succeeded, others returned errors: {:?}",
+		display = "Could not reach quorum of {} (sets={:?}). {} of {} request succeeded, others returned errors: {:?}",
 		_0,
 		_1,
 		_2,
-		_3
+		_3,
+		_4
 	)]
-	Quorum(usize, usize, usize, Vec<String>),
+	Quorum(usize, Option<usize>, usize, usize, Vec<String>),

 	#[error(display = "Unexpected RPC message: {}", _0)]
 	UnexpectedRpcMessage(String),
--- a/src/web/Cargo.toml
+++ b/src/web/Cargo.toml
@ -27,7 +27,7 @@ futures = "0.3"

 http = "0.2"
 hyper = { version = "0.14", features = ["server", "http1", "runtime", "tcp", "stream"] }
-#hyperlocal = { version = "0.8.0", default-features = false, features = ["server"] }
+hyperlocal = { version = "0.8.0", default-features = false, features = ["server"] }

 tokio = { version = "1.0", default-features = false, features = ["net"] }

--- a/src/web/web_server.rs
+++ b/src/web/web_server.rs
@ -1,5 +1,5 @@
-//use std::fs::{self, Permissions};
-//use std::os::unix::prelude::PermissionsExt;
+use std::fs::{self, Permissions};
+use std::os::unix::prelude::PermissionsExt;
 use std::{convert::Infallible, sync::Arc};

 use futures::future::Future;
@ -11,9 +11,9 @@ use hyper::{
 	Body, Method, Request, Response, Server, StatusCode,
 };

-//use hyperlocal::UnixServerExt;
+use hyperlocal::UnixServerExt;

-//use tokio::net::UnixStream;
+use tokio::net::UnixStream;

 use opentelemetry::{
 	global,
@ -100,18 +100,18 @@ impl WebServer {
 			}
 		});

-		//let unix_service = make_service_fn(|_: &UnixStream| {
-		//	let web_server = web_server.clone();
+		let unix_service = make_service_fn(|_: &UnixStream| {
+			let web_server = web_server.clone();

-		//	let path = addr.to_string();
-		//	async move {
-		//		remove_unix_socket_if_present(&path).await.expect("could not remove existing unix socket");
-		//		Ok::<_, Error>(service_fn(move |req: Request<Body>| {
-		//			let web_server = web_server.clone();
-		//			web_server.handle_request(req, path.clone())
-		//		}))
-		//	}
-		//});
+			let path = addr.to_string();
+			async move {
+				Ok::<_, Error>(service_fn(move |req: Request<Body>| {
+					let web_server = web_server.clone();
+
+					web_server.handle_request(req, path.clone())
+				}))
+			}
+		});

 		info!("Web server listening on {}", addr);

@ -122,22 +122,20 @@ impl WebServer {
 					.with_graceful_shutdown(shutdown_signal)
 					.await?
 			}
-			UnixOrTCPSocketAddress::UnixSocket(_path) => {
-				panic!("Unix sockets are not supported in this fork") // TODO(mediocregopher)
-			} //UnixOrTCPSocketAddress::UnixSocket(ref path) => {
-			  //	if path.exists() {
-			  //		fs::remove_file(path)?
-			  //	}
+			UnixOrTCPSocketAddress::UnixSocket(ref path) => {
+				if path.exists() {
+					fs::remove_file(path)?
+				}

-			  //	let bound = Server::bind_unix(path)?;
+				let bound = Server::bind_unix(path)?;

-			  //	fs::set_permissions(path, Permissions::from_mode(0o222))?;
+				fs::set_permissions(path, Permissions::from_mode(0o222))?;

-			  //	bound
-			  //		.serve(unix_service)
-			  //		.with_graceful_shutdown(shutdown_signal)
-			  //		.await?;
-			  //}
+				bound
+					.serve(unix_service)
+					.with_graceful_shutdown(shutdown_signal)
+					.await?;
+			}
 		};

 		Ok(())
Author	SHA1	Message	Date
Alex Auvolat	0041b013a4	layout: refactoring and fix in layout helper	2023-12-11 16:09:22 +01:00
Alex Auvolat	adccce1145	layout: refactor/fix bad while loop	2023-12-11 15:45:14 +01:00
Alex Auvolat	85b5a6bcd1	fix some clippy lints	2023-12-11 15:31:47 +01:00
Alex Auvolat	e4f493b481	table: remove redundant tracing in insert_many	2023-12-11 14:57:42 +01:00
Alex Auvolat	f8df90b79b	table: fix insert_many to not send duplicates	2023-12-08 14:54:11 +01:00
Alex Auvolat	4dbf254512	layout: refactoring, merge two files	2023-12-08 14:15:52 +01:00
Alex Auvolat	64a6e557a4	rpc helper: small refactorings	2023-12-08 12:18:12 +01:00
Alex Auvolat	5dd200c015	layout: move block_read_nodes_of to rpc_helper to avoid double-locking (in theory, this could have caused a deadlock)	2023-12-08 12:02:24 +01:00
Alex Auvolat	063294dd56	layout version: refactor get_node_zone	2023-12-08 11:50:58 +01:00
Alex Auvolat	7f2541101f	cli: improvements to the layout commands when multiple layouts are live	2023-12-08 11:24:23 +01:00
Alex Auvolat	91b874c4ef	rpc: fix system::health	2023-12-08 10:36:37 +01:00
Alex Auvolat	431b28e0cf	fix build with discovery features	2023-12-07 15:15:59 +01:00
Alex Auvolat	9cecea64d4	layout: allow sync update tracker to progress with only quorums	2023-12-07 14:51:20 +01:00
Alex Auvolat	aa59059a91	layout cli: safer skip-dead-nodes command	2023-12-07 11:56:14 +01:00
Alex Auvolat	d90de365b3	table sync: use write quorums to report global success or failure of sync	2023-12-07 11:16:10 +01:00
Alex Auvolat	95eb13eb08	rpc: refactor result tracking for quorum sets	2023-12-07 10:57:21 +01:00
Alex Auvolat	c8356a91d9	layout updates: fix the set of nodes among which minima are calculated	2023-12-07 10:30:26 +01:00
Alex Auvolat	c04dd8788a	admin: more info in admin GetClusterStatus	2023-11-28 14:25:04 +01:00
Alex Auvolat	539af6eac4	rpc helper: write comments + small refactoring of tracing	2023-11-28 11:12:39 +01:00
Alex Auvolat	c539077d30	cli: remove historic layout info from status	2023-11-27 16:22:27 +01:00
Alex Auvolat	11e6fef93c	cli: add layout history and layout assume-sync commands	2023-11-27 16:22:25 +01:00
Alex Auvolat	539a920313	cli: show when nodes are draining metadata	2023-11-27 13:18:59 +01:00
Alex Auvolat	78362140f5	rpc: update system::health to take into account write sets for all partitions	2023-11-27 12:10:21 +01:00
Alex Auvolat	d6d239fc79	block manager: read_block using old layout versions if necessary	2023-11-27 11:52:57 +01:00
Alex Auvolat	3ecd14b9f6	table: implement write sets for insert_many	2023-11-16 16:41:45 +01:00
Alex Auvolat	22f38808e7	rpc_helper: don't use tokio::spawn for individual requests	2023-11-16 16:34:01 +01:00
Alex Auvolat	707442f5de	layout: refactor digests and add "!=" assertions before epidemic bcast	2023-11-16 13:51:40 +01:00
Alex Auvolat	ad5c6f779f	layout: split helper in separate file; more precise difference tracking	2023-11-16 13:26:43 +01:00
Alex Auvolat	d4df03424f	layout: fix test	2023-11-15 15:56:57 +01:00
Alex Auvolat	33c8a489b0	layou: implement ack locking	2023-11-15 15:40:44 +01:00
Alex Auvolat	393c4d4515	layout: add helper for cached/external values to centralize recomputation	2023-11-15 14:20:50 +01:00
Alex Auvolat	65066c7064	layout: wip cache global mins	2023-11-15 13:28:30 +01:00
Alex Auvolat	acd49de9f9	rpc: fix write set quorums	2023-11-15 13:07:42 +01:00
Alex Auvolat	46007bf01d	integration test: print stdout and stderr on subcommand crash	2023-11-15 12:56:52 +01:00
Alex Auvolat	b3e729f4b8	layout history merge: rm invalid versions when valid versions are added	2023-11-15 12:15:58 +01:00
Alex Auvolat	7ef2c23120	layout: fix test	2023-11-14 15:45:01 +01:00
Alex Auvolat	90e1619b1e	table: take into account multiple write sets in inserts	2023-11-14 15:40:46 +01:00
Alex Auvolat	3b361d2959	layout: prepare for write sets	2023-11-14 14:28:16 +01:00
Alex Auvolat	866196750f	system: add todo wrt new layout	2023-11-14 13:36:58 +01:00
Alex Auvolat	83a11374ca	layout: fixes in schema	2023-11-14 13:29:26 +01:00
Alex Auvolat	1aab1f4e68	layout: refactoring of all_nodes	2023-11-14 13:12:32 +01:00
Alex Auvolat	8e292e06b3	layout: some refactoring of nongateway nodes	2023-11-14 12:48:38 +01:00
Alex Auvolat	9a491fa137	layout: fix test	2023-11-11 13:10:59 +01:00
Alex Auvolat	df24bb806d	layout/sync: fix bugs and add tracing	2023-11-11 12:44:27 +01:00
Alex Auvolat	ce89d1ddab	table sync: adapt to new layout history	2023-11-11 12:08:32 +01:00
Alex Auvolat	df36cf3099	layout: add helpers to LayoutHistory and prepare integration with Table	2023-11-09 16:32:31 +01:00
Alex Auvolat	9d95f6f704	layout: fix tracker bugs	2023-11-09 15:52:45 +01:00
Alex Auvolat	bad7cc812e	layout admin: add missing calls to update_hash	2023-11-09 15:42:10 +01:00
Alex Auvolat	03ebf18830	layout: begin managing the update tracker values	2023-11-09 15:31:59 +01:00
Alex Auvolat	94caf9c0c1	layout: separate code path for synchronizing update trackers only	2023-11-09 14:53:34 +01:00
Alex Auvolat	bfb1845fdc	layout: refactor to use a RwLock on LayoutHistory	2023-11-09 14:12:05 +01:00
Alex Auvolat	19ef1ec8e7	layout: more refactoring	2023-11-09 13:34:14 +01:00
Alex Auvolat	8a2b1dd422	wip: split out layout management from System into separate LayoutManager	2023-11-09 12:55:36 +01:00
Alex Auvolat	523d2ecb95	layout: use separate CRDT for staged layout changes	2023-11-09 11:19:43 +01:00
Alex Auvolat	1da0a5676e	bump garage protocol version tag to 0x000A (0.10)	2023-11-08 19:30:58 +01:00
Alex Auvolat	8dccee3ccf	cluster layout: adapt all uses of ClusterLayout to LayoutHistory	2023-11-08 19:28:36 +01:00
Alex Auvolat	fe9af1dcaa	WIP: garage_rpc: store layout version history	2023-11-08 17:49:06 +01:00
Alex Auvolat	4a9c94514f	avoid using layout_watch in System directly	2023-11-08 16:41:00 +01:00
Alex Auvolat	12d1dbfc6b	remove Ring and use ClusterLayout everywhere	2023-11-08 15:41:24 +01:00
Alex Auvolat	0962313ebd	garage_rpc: reorder functions in layout.rs	2023-11-08 13:13:04 +01:00