Refactor block resync loop; make workers infallible

This commit is contained in:
Alex Auvolat 2021-03-15 20:09:44 +01:00
parent 667e4e72a8
commit 4d4117f2b4
7 changed files with 49 additions and 47 deletions

View File

@ -20,6 +20,16 @@ impl Repair {
&self, &self,
opt: RepairOpt, opt: RepairOpt,
must_exit: watch::Receiver<bool>, must_exit: watch::Receiver<bool>,
) {
if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
warn!("Repair worker failed with error: {}", e);
}
}
async fn repair_worker_aux(
&self,
opt: RepairOpt,
must_exit: watch::Receiver<bool>,
) -> Result<(), Error> { ) -> Result<(), Error> {
let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true); let todo = |x| opt.what.as_ref().map(|y| *y == x).unwrap_or(true);

View File

@ -258,28 +258,31 @@ impl BlockManager {
async fn resync_loop( async fn resync_loop(
self: Arc<Self>, self: Arc<Self>,
mut must_exit: watch::Receiver<bool>, mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> { ) {
let mut n_failures = 0usize;
while !*must_exit.borrow() { while !*must_exit.borrow() {
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? { if let Err(e) = self.resync_iter(&mut must_exit).await {
warn!("Error in block resync loop: {}", e);
tokio::time::delay_for(Duration::from_secs(10)).await;
}
}
}
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<(), Error> {
if let Some(first_item) = self.resync_queue.iter().next() {
let (time_bytes, hash_bytes) = first_item?;
let time_msec = u64_from_be_bytes(&time_bytes[0..8]); let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
let now = now_msec(); let now = now_msec();
if now >= time_msec { if now >= time_msec {
let hash = Hash::try_from(&hash_bytes[..]).unwrap(); let hash = Hash::try_from(&hash_bytes[..]).unwrap();
let res = self.resync_block(&hash).await;
if let Err(e) = self.resync_iter(&hash).await { if let Err(e) = &res {
warn!("Failed to resync block {:?}, retrying later: {}", hash, e); warn!("Error when resyncing {:?}: {}", hash, e);
self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?; self.put_to_resync(&hash, RESYNC_RETRY_TIMEOUT)?;
n_failures += 1;
if n_failures >= 10 {
warn!("Too many resync failures, throttling.");
tokio::time::delay_for(Duration::from_secs(1)).await;
} }
self.resync_queue.remove(&time_bytes)?;
res?; // propagate error to delay main loop
} else { } else {
n_failures = 0;
}
} else {
self.resync_queue.insert(time_bytes, hash_bytes)?;
let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now)); let delay = tokio::time::delay_for(Duration::from_millis(time_msec - now));
select! { select! {
_ = delay.fuse() => (), _ = delay.fuse() => (),
@ -293,11 +296,10 @@ impl BlockManager {
_ = must_exit.recv().fuse() => (), _ = must_exit.recv().fuse() => (),
} }
} }
}
Ok(()) Ok(())
} }
async fn resync_iter(&self, hash: &Hash) -> Result<(), Error> { async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
let lock = self.data_dir_lock.lock().await; let lock = self.data_dir_lock.lock().await;
let path = self.block_path(hash); let path = self.block_path(hash);

View File

@ -318,9 +318,7 @@ impl System {
let self2 = self.clone(); let self2 = self.clone();
self.clone() self.clone()
.background .background
.spawn_worker(format!("ping loop"), |stop_signal| { .spawn_worker(format!("ping loop"), |stop_signal| self2.ping_loop(stop_signal));
self2.ping_loop(stop_signal).map(Ok)
});
if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) { if let (Some(consul_host), Some(consul_service_name)) = (consul_host, consul_service_name) {
let self2 = self.clone(); let self2 = self.clone();
@ -329,7 +327,6 @@ impl System {
.spawn_worker(format!("Consul loop"), |stop_signal| { .spawn_worker(format!("Consul loop"), |stop_signal| {
self2 self2
.consul_loop(stop_signal, consul_host, consul_service_name) .consul_loop(stop_signal, consul_host, consul_service_name)
.map(Ok)
}); });
} }
} }

View File

@ -70,7 +70,7 @@ where
gc gc
} }
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) -> Result<(), Error> { async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() { while !*must_exit.borrow() {
match self.gc_loop_iter().await { match self.gc_loop_iter().await {
Ok(true) => { Ok(true) => {
@ -89,7 +89,6 @@ where
_ = must_exit.recv().fuse() => (), _ = must_exit.recv().fuse() => (),
} }
} }
Ok(())
} }
async fn gc_loop_iter(&self) -> Result<bool, Error> { async fn gc_loop_iter(&self) -> Result<bool, Error> {

View File

@ -104,7 +104,7 @@ impl MerkleUpdater {
async fn updater_loop( async fn updater_loop(
self: Arc<Self>, self: Arc<Self>,
mut must_exit: watch::Receiver<bool>, mut must_exit: watch::Receiver<bool>,
) -> Result<(), Error> { ) {
while !*must_exit.borrow() { while !*must_exit.borrow() {
if let Some(x) = self.todo.iter().next() { if let Some(x) = self.todo.iter().next() {
match x { match x {
@ -131,7 +131,6 @@ impl MerkleUpdater {
} }
} }
} }
Ok(())
} }
fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> { fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> {

View File

@ -136,7 +136,7 @@ where
self: Arc<Self>, self: Arc<Self>,
mut must_exit: watch::Receiver<bool>, mut must_exit: watch::Receiver<bool>,
mut busy_rx: mpsc::UnboundedReceiver<bool>, mut busy_rx: mpsc::UnboundedReceiver<bool>,
) -> Result<(), Error> { ) {
let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone(); let mut prev_ring: Arc<Ring> = self.aux.system.ring.borrow().clone();
let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone(); let mut ring_recv: watch::Receiver<Arc<Ring>> = self.aux.system.ring.clone();
let mut nothing_to_do_since = Some(Instant::now()); let mut nothing_to_do_since = Some(Instant::now());
@ -183,7 +183,6 @@ where
} }
} }
} }
Ok(())
} }
pub fn add_full_sync(&self) { pub fn add_full_sync(&self) {
@ -197,11 +196,11 @@ where
self: Arc<Self>, self: Arc<Self>,
mut must_exit: watch::Receiver<bool>, mut must_exit: watch::Receiver<bool>,
busy_tx: mpsc::UnboundedSender<bool>, busy_tx: mpsc::UnboundedSender<bool>,
) -> Result<(), Error> { ) {
while !*must_exit.borrow() { while !*must_exit.borrow() {
let task = self.todo.lock().unwrap().pop_task(); let task = self.todo.lock().unwrap().pop_task();
if let Some(partition) = task { if let Some(partition) = task {
busy_tx.send(true)?; busy_tx.send(true).unwrap();
let res = self let res = self
.clone() .clone()
.sync_partition(&partition, &mut must_exit) .sync_partition(&partition, &mut must_exit)
@ -213,11 +212,10 @@ where
); );
} }
} else { } else {
busy_tx.send(false)?; busy_tx.send(false).unwrap();
tokio::time::delay_for(Duration::from_secs(1)).await; tokio::time::delay_for(Duration::from_secs(1)).await;
} }
} }
Ok(())
} }
async fn sync_partition( async fn sync_partition(

View File

@ -76,16 +76,13 @@ impl BackgroundRunner {
pub fn spawn_worker<F, T>(&self, name: String, worker: F) pub fn spawn_worker<F, T>(&self, name: String, worker: F)
where where
F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static, F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
T: Future<Output = JobOutput> + Send + 'static, T: Future<Output = ()> + Send + 'static,
{ {
let mut workers = self.workers.lock().unwrap(); let mut workers = self.workers.lock().unwrap();
let stop_signal = self.stop_signal.clone(); let stop_signal = self.stop_signal.clone();
workers.push(tokio::spawn(async move { workers.push(tokio::spawn(async move {
if let Err(e) = worker(stop_signal).await { worker(stop_signal).await;
error!("Worker stopped with error: {}, error: {}", name, e); info!("Worker exited: {}", name);
} else {
info!("Worker exited successfully: {}", name);
}
})); }));
} }