Simplified and more aggressive worker exit logic

This commit is contained in:
Alex Auvolat 2022-12-14 15:25:29 +01:00
parent d4af27f920
commit dfc131850a
No known key found for this signature in database
GPG Key ID: 0E496D15096376BE
8 changed files with 32 additions and 71 deletions

View File

@ -148,7 +148,7 @@ impl Worker for RepairWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }
@ -341,7 +341,7 @@ impl Worker for ScrubWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
let (wait_until, command) = match &self.work { let (wait_until, command) = match &self.work {
ScrubWorkerState::Running(_) => return WorkerState::Busy, ScrubWorkerState::Running(_) => return WorkerState::Busy,
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume), ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),

View File

@ -540,7 +540,7 @@ impl Worker for ResyncWorker {
} }
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
while self.index >= self.manager.resync.persisted.load().n_workers { while self.index >= self.manager.resync.persisted.load().n_workers {
self.manager.resync.notify.notified().await self.manager.resync.notify.notified().await
} }

View File

@ -136,7 +136,7 @@ impl Worker for RepairVersionsWorker {
Ok(WorkerState::Busy) Ok(WorkerState::Busy)
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }
@ -214,7 +214,7 @@ impl Worker for RepairBlockrefsWorker {
Ok(WorkerState::Busy) Ok(WorkerState::Busy)
} }
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
unreachable!() unreachable!()
} }
} }

View File

@ -348,10 +348,7 @@ where
} }
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
tokio::time::sleep(self.wait_delay).await; tokio::time::sleep(self.wait_delay).await;
WorkerState::Busy WorkerState::Busy
} }

View File

@ -340,10 +340,7 @@ where
.unwrap() .unwrap()
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
select! { select! {
_ = tokio::time::sleep(Duration::from_secs(60)) => (), _ = tokio::time::sleep(Duration::from_secs(60)) => (),
_ = self.0.data.merkle_todo_notify.notified() => (), _ = self.0.data.merkle_todo_notify.notified() => (),

View File

@ -71,10 +71,7 @@ where
Ok(WorkerState::Busy) Ok(WorkerState::Busy)
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
select! { select! {
_ = tokio::time::sleep(Duration::from_secs(600)) => (), _ = tokio::time::sleep(Duration::from_secs(600)) => (),
_ = self.0.data.insert_queue_notify.notified() => (), _ = self.0.data.insert_queue_notify.notified() => (),

View File

@ -593,10 +593,7 @@ impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWor
} }
} }
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
select! { select! {
s = self.add_full_sync_rx.recv() => { s = self.add_full_sync_rx.recv() => {
if let Some(()) = s { if let Some(()) = s {

View File

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::Duration;
use async_trait::async_trait; use async_trait::async_trait;
use futures::future::*; use futures::future::*;
@ -14,6 +14,10 @@ use crate::background::{WorkerInfo, WorkerStatus};
use crate::error::Error; use crate::error::Error;
use crate::time::now_msec; use crate::time::now_msec;
// All workers that haven't exited for this time after an exit signal was recieved
// will be interrupted in the middle of whatever they are doing.
const EXIT_DEADLINE: Duration = Duration::from_secs(8);
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)] #[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
pub enum WorkerState { pub enum WorkerState {
Busy, Busy,
@ -50,10 +54,8 @@ pub trait Worker: Send {
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>; async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
/// Wait for work: await for some task to become available. This future can be interrupted in /// Wait for work: await for some task to become available. This future can be interrupted in
/// the middle for any reason. This future doesn't have to await on must_exit.changed(), we /// the middle for any reason, for example if an interrupt signal was recieved.
/// are doing it for you. Therefore it only receives a read refernce to must_exit which allows async fn wait_for_work(&mut self) -> WorkerState;
/// it to check if we are exiting.
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
} }
pub(crate) struct WorkerProcessor { pub(crate) struct WorkerProcessor {
@ -93,11 +95,9 @@ impl WorkerProcessor {
let task_id = next_task_id; let task_id = next_task_id;
next_task_id += 1; next_task_id += 1;
let stop_signal = self.stop_signal.clone(); let stop_signal = self.stop_signal.clone();
let stop_signal_worker = self.stop_signal.clone();
let mut worker = WorkerHandler { let mut worker = WorkerHandler {
task_id, task_id,
stop_signal, stop_signal,
stop_signal_worker,
worker: new_worker, worker: new_worker,
state: WorkerState::Busy, state: WorkerState::Busy,
errors: 0, errors: 0,
@ -153,26 +153,14 @@ impl WorkerProcessor {
} }
// We are exiting, drain everything // We are exiting, drain everything
let drain_half_time = Instant::now() + Duration::from_secs(5);
let drain_everything = async move { let drain_everything = async move {
while let Some(mut worker) = workers.next().await { while let Some(worker) = workers.next().await {
if worker.state == WorkerState::Done { info!(
info!( "Worker {} (TID {}) exited (last state: {:?})",
"Worker {} (TID {}) exited", worker.worker.name(),
worker.worker.name(), worker.task_id,
worker.task_id worker.state
); );
} else if Instant::now() > drain_half_time {
warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
} else {
workers.push(
async move {
worker.step().await;
worker
}
.boxed(),
);
}
} }
}; };
@ -180,7 +168,7 @@ impl WorkerProcessor {
_ = drain_everything => { _ = drain_everything => {
info!("All workers exited peacefully \\o/"); info!("All workers exited peacefully \\o/");
} }
_ = tokio::time::sleep(Duration::from_secs(9)) => { _ = tokio::time::sleep(EXIT_DEADLINE) => {
error!("Some workers could not exit in time, we are cancelling some things in the middle"); error!("Some workers could not exit in time, we are cancelling some things in the middle");
} }
} }
@ -190,7 +178,6 @@ impl WorkerProcessor {
struct WorkerHandler { struct WorkerHandler {
task_id: usize, task_id: usize,
stop_signal: watch::Receiver<bool>, stop_signal: watch::Receiver<bool>,
stop_signal_worker: watch::Receiver<bool>,
worker: Box<dyn Worker>, worker: Box<dyn Worker>,
state: WorkerState, state: WorkerState,
errors: usize, errors: usize,
@ -225,33 +212,19 @@ impl WorkerHandler {
}, },
WorkerState::Throttled(delay) => { WorkerState::Throttled(delay) => {
// Sleep for given delay and go back to busy state // Sleep for given delay and go back to busy state
if !*self.stop_signal.borrow() { select! {
select! { _ = tokio::time::sleep(Duration::from_secs_f32(delay)) => {
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (), self.state = WorkerState::Busy;
_ = self.stop_signal.changed() => (),
} }
_ = self.stop_signal.changed() => (),
} }
self.state = WorkerState::Busy;
} }
WorkerState::Idle => { WorkerState::Idle => {
if *self.stop_signal.borrow() { select! {
select! { new_st = self.worker.wait_for_work() => {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => { self.state = new_st;
self.state = new_st;
}
_ = tokio::time::sleep(Duration::from_secs(1)) => {
// stay in Idle state
}
}
} else {
select! {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
self.state = new_st;
}
_ = self.stop_signal.changed() => {
// stay in Idle state
}
} }
_ = self.stop_signal.changed() => (),
} }
} }
WorkerState::Done => unreachable!(), WorkerState::Done => unreachable!(),