cli: show when nodes are draining metadata

This commit is contained in:
Alex Auvolat 2023-11-27 13:18:59 +01:00
parent 78362140f5
commit 539a920313
No known key found for this signature in database
GPG Key ID: 0E496D15096376BE

View File

@ -1,4 +1,4 @@
use std::collections::HashSet; use std::collections::{HashMap, HashSet};
use std::time::Duration; use std::time::Duration;
use format_table::format_table; use format_table::format_table;
@ -62,35 +62,69 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
let mut healthy_nodes = let mut healthy_nodes =
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()];
for adv in status.iter().filter(|adv| adv.is_up) { for adv in status.iter().filter(|adv| adv.is_up) {
match layout.current().roles.get(&adv.id) { if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) {
Some(NodeRoleV(Some(cfg))) => { let data_avail = match &adv.status.data_disk_avail {
let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(),
_ if cfg.capacity.is_none() => "N/A".into(), Some((avail, total)) => {
Some((avail, total)) => { let pct = (*avail as f64) / (*total as f64) * 100.;
let pct = (*avail as f64) / (*total as f64) * 100.; let avail = bytesize::ByteSize::b(*avail);
let avail = bytesize::ByteSize::b(*avail); format!("{} ({:.1}%)", avail, pct)
format!("{} ({:.1}%)", avail, pct) }
} None => "?".into(),
None => "?".into(), };
}; healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}",
id = adv.id,
host = adv.status.hostname,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
capacity = cfg.capacity_string(),
data_avail = data_avail,
));
} else {
let prev_role = layout
.versions
.iter()
.rev()
.find_map(|x| match x.roles.get(&adv.id) {
Some(NodeRoleV(Some(cfg))) => Some(cfg),
_ => None,
});
let historic_role =
layout
.old_versions
.iter()
.rev()
.find_map(|x| match x.roles.get(&adv.id) {
Some(NodeRoleV(Some(cfg))) => Some(cfg),
_ => None,
});
if let Some(cfg) = prev_role {
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...",
id = adv.id, id = adv.id,
host = adv.status.hostname, host = adv.status.hostname,
addr = adv.addr, addr = adv.addr,
tags = cfg.tags.join(","), tags = cfg.tags.join(","),
zone = cfg.zone, zone = cfg.zone,
capacity = cfg.capacity_string(),
data_avail = data_avail,
)); ));
} } else if let Some(cfg) = historic_role {
_ => { healthy_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained",
id = adv.id,
host = adv.status.hostname,
addr = adv.addr,
tags = cfg.tags.join(","),
zone = cfg.zone,
));
} else {
let new_role = match layout.staging.get().roles.get(&adv.id) { let new_role = match layout.staging.get().roles.get(&adv.id) {
Some(NodeRoleV(Some(_))) => "(pending)", Some(NodeRoleV(Some(_))) => "pending...",
_ => "NO ROLE ASSIGNED", _ => "NO ROLE ASSIGNED",
}; };
healthy_nodes.push(format!( healthy_nodes.push(format!(
"{id:?}\t{h}\t{addr}\t{new_role}", "{id:?}\t{h}\t{addr}\t\t\t{new_role}",
id = adv.id, id = adv.id,
h = adv.status.hostname, h = adv.status.hostname,
addr = adv.addr, addr = adv.addr,
@ -101,55 +135,65 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
} }
format_table(healthy_nodes); format_table(healthy_nodes);
let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>(); // Determine which nodes are unhealthy and print that to stdout
let failure_case_1 = status.iter().any(|adv| { let status_map = status
!adv.is_up
&& matches!(
layout.current().roles.get(&adv.id),
Some(NodeRoleV(Some(_)))
)
});
let failure_case_2 = layout
.current()
.roles
.items()
.iter() .iter()
.any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); .map(|adv| (adv.id, adv))
if failure_case_1 || failure_case_2 { .collect::<HashMap<_, _>>();
println!("\n==== FAILED NODES ====");
let mut failed_nodes = let tf = timeago::Formatter::new();
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut failed_nodes =
for adv in status.iter().filter(|adv| !adv.is_up) { vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let mut listed = HashSet::new();
let tf = timeago::Formatter::new(); for ver in layout.versions.iter().rev() {
failed_nodes.push(format!( for (node, _, role) in ver.roles.items().iter() {
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", let cfg = match role {
id = adv.id, NodeRoleV(Some(role)) if role.capacity.is_some() => role,
host = adv.status.hostname, _ => continue,
addr = adv.addr, };
tags = cfg.tags.join(","),
zone = cfg.zone, if listed.contains(node) {
capacity = cfg.capacity_string(), continue;
last_seen = adv }
.last_seen_secs_ago listed.insert(*node);
let adv = status_map.get(node);
if adv.map(|x| x.is_up).unwrap_or(false) {
continue;
}
// Node is in a layout version, is not a gateway node, and is not up:
// it is in a failed state, add proper line to the output
let (host, addr, last_seen) = match adv {
Some(adv) => (
adv.status.hostname.as_str(),
adv.addr.to_string(),
adv.last_seen_secs_ago
.map(|s| tf.convert(Duration::from_secs(s))) .map(|s| tf.convert(Duration::from_secs(s)))
.unwrap_or_else(|| "never seen".into()), .unwrap_or_else(|| "never seen".into()),
)); ),
} None => ("??", "??".into(), "never seen".into()),
} };
for (id, _, role_v) in layout.current().roles.items().iter() { let capacity = if ver.version == layout.current().version {
if let NodeRoleV(Some(cfg)) = role_v { cfg.capacity_string()
if !status_keys.contains(id) { } else {
failed_nodes.push(format!( "draining metadata...".to_string()
"{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", };
id = id, failed_nodes.push(format!(
tags = cfg.tags.join(","), "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
zone = cfg.zone, id = node,
capacity = cfg.capacity_string(), host = host,
)); addr = addr,
} tags = cfg.tags.join(","),
} zone = cfg.zone,
capacity = capacity,
last_seen = last_seen,
));
} }
}
if failed_nodes.len() > 1 {
println!("\n==== FAILED NODES ====");
format_table(failed_nodes); format_table(failed_nodes);
} }