Small change to partition assignation algorithm

This change helps ensure that nodes for each partition are spread over all datacenters, a property that wasn't ensured previously when going from a 2 DC deployment to a 3 DC deployment
2022-02-10 16:10:21 +01:00 · 2022-02-10 16:10:21 +01:00 · 413ab0eaed
commit 413ab0eaed
parent 43945234ae
2 changed files with 37 additions and 9 deletions
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@ -196,6 +196,15 @@ pub async fn cmd_apply_layout(
 ) -> Result<(), Error> {
 	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;

+	layout.roles.merge(&layout.staging);
+
+	if !layout.calculate_partition_assignation() {
+		return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
+	}
+
+	layout.staging.clear();
+	layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
+
 	match apply_opt.version {
 		None => {
 			println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
@ -209,15 +218,6 @@ pub async fn cmd_apply_layout(
 		}
 	}

-	layout.roles.merge(&layout.staging);
-
-	if !layout.calculate_partition_assignation() {
-		return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
-	}
-
-	layout.staging.clear();
-	layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
-
 	layout.version += 1;

 	send_layout(rpc_cli, rpc_host, layout).await?;
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@ -172,12 +172,38 @@ impl ClusterLayout {
 		println!("Calculating updated partition assignation, this may take some time...");
 		println!();

+		// Get old partition assignation
 		let old_partitions = self.parse_assignation_data();

+		// Create new partition assignation starting from old one
 		let mut partitions = old_partitions.clone();
+
+		// Cleanup steps in new partition assignation:
+		let min_keep_nodes_per_part = (self.replication_factor + 1) / 2;
 		for part in partitions.iter_mut() {
+			// - remove from assignation nodes that don't have a role in the layout anymore
 			part.nodes
 				.retain(|(_, info)| info.map(|x| x.capacity.is_some()).unwrap_or(false));
+
+			// - remove from assignation some nodes that are in the same datacenter
+			//   if we can, so that the later steps can ensure datacenter variety
+			//   as much as possible (but still under the constraint that each partition
+			//   should not move from at least a certain number of nodes that is
+			//   min_keep_nodes_per_part)
+			'rmloop: while part.nodes.len() > min_keep_nodes_per_part {
+				let mut zns_c = HashMap::<&str, usize>::new();
+				for (_id, info) in part.nodes.iter() {
+					*zns_c.entry(info.unwrap().zone.as_str()).or_insert(0) += 1;
+				}
+				for i in 0..part.nodes.len() {
+					if zns_c[part.nodes[i].1.unwrap().zone.as_str()] > 1 {
+						part.nodes.remove(i);
+						continue 'rmloop;
+					}
+				}
+
+				break;
+			}
 		}

 		// When nodes are removed, or when bootstraping an assignation from
@ -196,6 +222,8 @@ impl ClusterLayout {
 				}
 			}
 			None => {
+				// Not enough nodes in cluster to build a correct assignation.
+				// Signal it by returning an error.
 				return false;
 			}
 		}