Simplified and more aggressive worker exit logic

This commit is contained in:
Alex Auvolat 2022-12-14 15:25:29 +01:00
parent d4af27f920
commit dfc131850a
No known key found for this signature in database
GPG Key ID: 0E496D15096376BE
8 changed files with 32 additions and 71 deletions

View File

@ -148,7 +148,7 @@ impl Worker for RepairWorker {
}
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
async fn wait_for_work(&mut self) -> WorkerState {
unreachable!()
}
}
@ -341,7 +341,7 @@ impl Worker for ScrubWorker {
}
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
async fn wait_for_work(&mut self) -> WorkerState {
let (wait_until, command) = match &self.work {
ScrubWorkerState::Running(_) => return WorkerState::Busy,
ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),

View File

@ -540,7 +540,7 @@ impl Worker for ResyncWorker {
}
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
async fn wait_for_work(&mut self) -> WorkerState {
while self.index >= self.manager.resync.persisted.load().n_workers {
self.manager.resync.notify.notified().await
}

View File

@ -136,7 +136,7 @@ impl Worker for RepairVersionsWorker {
Ok(WorkerState::Busy)
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
async fn wait_for_work(&mut self) -> WorkerState {
unreachable!()
}
}
@ -214,7 +214,7 @@ impl Worker for RepairBlockrefsWorker {
Ok(WorkerState::Busy)
}
async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
async fn wait_for_work(&mut self) -> WorkerState {
unreachable!()
}
}

View File

@ -348,10 +348,7 @@ where
}
}
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
async fn wait_for_work(&mut self) -> WorkerState {
tokio::time::sleep(self.wait_delay).await;
WorkerState::Busy
}

View File

@ -340,10 +340,7 @@ where
.unwrap()
}
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
async fn wait_for_work(&mut self) -> WorkerState {
select! {
_ = tokio::time::sleep(Duration::from_secs(60)) => (),
_ = self.0.data.merkle_todo_notify.notified() => (),

View File

@ -71,10 +71,7 @@ where
Ok(WorkerState::Busy)
}
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
async fn wait_for_work(&mut self) -> WorkerState {
select! {
_ = tokio::time::sleep(Duration::from_secs(600)) => (),
_ = self.0.data.insert_queue_notify.notified() => (),

View File

@ -593,10 +593,7 @@ impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWor
}
}
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
if *must_exit.borrow() {
return WorkerState::Done;
}
async fn wait_for_work(&mut self) -> WorkerState {
select! {
s = self.add_full_sync_rx.recv() => {
if let Some(()) = s {

View File

@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::Duration;
use async_trait::async_trait;
use futures::future::*;
@ -14,6 +14,10 @@ use crate::background::{WorkerInfo, WorkerStatus};
use crate::error::Error;
use crate::time::now_msec;
// All workers that haven't exited for this time after an exit signal was recieved
// will be interrupted in the middle of whatever they are doing.
const EXIT_DEADLINE: Duration = Duration::from_secs(8);
#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
pub enum WorkerState {
Busy,
@ -50,10 +54,8 @@ pub trait Worker: Send {
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
/// Wait for work: await for some task to become available. This future can be interrupted in
/// the middle for any reason. This future doesn't have to await on must_exit.changed(), we
/// are doing it for you. Therefore it only receives a read refernce to must_exit which allows
/// it to check if we are exiting.
async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
/// the middle for any reason, for example if an interrupt signal was recieved.
async fn wait_for_work(&mut self) -> WorkerState;
}
pub(crate) struct WorkerProcessor {
@ -93,11 +95,9 @@ impl WorkerProcessor {
let task_id = next_task_id;
next_task_id += 1;
let stop_signal = self.stop_signal.clone();
let stop_signal_worker = self.stop_signal.clone();
let mut worker = WorkerHandler {
task_id,
stop_signal,
stop_signal_worker,
worker: new_worker,
state: WorkerState::Busy,
errors: 0,
@ -153,26 +153,14 @@ impl WorkerProcessor {
}
// We are exiting, drain everything
let drain_half_time = Instant::now() + Duration::from_secs(5);
let drain_everything = async move {
while let Some(mut worker) = workers.next().await {
if worker.state == WorkerState::Done {
while let Some(worker) = workers.next().await {
info!(
"Worker {} (TID {}) exited",
"Worker {} (TID {}) exited (last state: {:?})",
worker.worker.name(),
worker.task_id
worker.task_id,
worker.state
);
} else if Instant::now() > drain_half_time {
warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
} else {
workers.push(
async move {
worker.step().await;
worker
}
.boxed(),
);
}
}
};
@ -180,7 +168,7 @@ impl WorkerProcessor {
_ = drain_everything => {
info!("All workers exited peacefully \\o/");
}
_ = tokio::time::sleep(Duration::from_secs(9)) => {
_ = tokio::time::sleep(EXIT_DEADLINE) => {
error!("Some workers could not exit in time, we are cancelling some things in the middle");
}
}
@ -190,7 +178,6 @@ impl WorkerProcessor {
struct WorkerHandler {
task_id: usize,
stop_signal: watch::Receiver<bool>,
stop_signal_worker: watch::Receiver<bool>,
worker: Box<dyn Worker>,
state: WorkerState,
errors: usize,
@ -225,33 +212,19 @@ impl WorkerHandler {
},
WorkerState::Throttled(delay) => {
// Sleep for given delay and go back to busy state
if !*self.stop_signal.borrow() {
select! {
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (),
_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => {
self.state = WorkerState::Busy;
}
_ = self.stop_signal.changed() => (),
}
}
self.state = WorkerState::Busy;
}
WorkerState::Idle => {
if *self.stop_signal.borrow() {
select! {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
new_st = self.worker.wait_for_work() => {
self.state = new_st;
}
_ = tokio::time::sleep(Duration::from_secs(1)) => {
// stay in Idle state
}
}
} else {
select! {
new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
self.state = new_st;
}
_ = self.stop_signal.changed() => {
// stay in Idle state
}
}
_ = self.stop_signal.changed() => (),
}
}
WorkerState::Done => unreachable!(),