Several changes in table_sync:

- separate code path for the case of offloading a partition we don't store
- use std::sync::Mutex instead of tokio::sync::Mutex and make fewer functions async (see the short sketch below)
Alex Auvolat 2021-02-23 19:11:02 +01:00
parent 40763fd749
commit 55156cca9d
2 changed files with 191 additions and 119 deletions
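
The Mutex change in the second bullet rests on a simple rule: a std::sync::Mutex guard must never be held across an .await, but when each critical section is short and fully synchronous (lock().unwrap() followed immediately by a method call, as in the diff below), the plain mutex suffices and the functions around it no longer need to be async. Here is a minimal sketch of that pattern; it is not taken from this repository, uses a made-up TodoQueue type, and assumes tokio (with the "macros" and runtime features) as the async runtime.

// Minimal sketch (not Garage code): a std::sync::Mutex is enough when the
// guard is never held across an .await, and it lets helpers that only touch
// the locked state stay synchronous.
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};

struct TodoQueue {
    tasks: Mutex<VecDeque<String>>,
}

impl TodoQueue {
    // Synchronous: lock, pop, release. No .await happens while the guard is
    // alive, so no tokio::sync::Mutex (and no async fn) is required.
    fn pop_task(&self) -> Option<String> {
        self.tasks.lock().unwrap().pop_front()
    }
}

#[tokio::main]
async fn main() {
    let queue = Arc::new(TodoQueue {
        tasks: Mutex::new(VecDeque::from(vec!["partition-1".to_string()])),
    });

    // The guard is already dropped here, so the await below never holds a
    // blocking lock across a suspension point.
    if let Some(task) = queue.pop_task() {
        tokio::task::yield_now().await; // stand-in for real async work
        println!("processed {}", task);
    }
}

The diff below follows the same pattern for SyncTodo and the checksum caches: every lock().unwrap() releases its guard before the next await point.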


@@ -435,7 +435,7 @@ where
 let syncer = self.syncer.load_full().unwrap();
 debug!("({}) Deleting range {:?} - {:?}", self.name, begin, end);
-let mut count = 0;
+let mut count: usize = 0;
 while let Some((key, _value)) = self.store.get_lt(end.as_slice())? {
 if key.as_ref() < begin.as_slice() {
 break;


@@ -1,15 +1,14 @@
 use rand::Rng;
 use std::collections::{BTreeMap, VecDeque};
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
-use futures::future::BoxFuture;
+use futures::future::join_all;
 use futures::{pin_mut, select};
 use futures_util::future::*;
 use futures_util::stream::*;
 use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;
-use tokio::sync::Mutex;
 use tokio::sync::{mpsc, watch};
 
 use garage_rpc::ring::Ring;
@@ -33,7 +32,7 @@ pub struct TableSyncer<F: TableSchema, R: TableReplication> {
 pub enum SyncRPC {
 GetRootChecksumRange(Hash, Hash),
 RootChecksumRange(SyncRange),
-Checksums(Vec<RangeChecksum>, bool),
+Checksums(Vec<RangeChecksum>),
 Difference(Vec<SyncRange>, Vec<Arc<ByteBuf>>),
 }
@@ -43,8 +42,11 @@ pub struct SyncTodo {
 #[derive(Debug, Clone)]
 struct TodoPartition {
+// Partition consists in hashes between begin included and end excluded
 begin: Hash,
 end: Hash,
+
+// Are we a node that stores this partition or not?
 retain: bool,
 }
@@ -161,7 +163,7 @@ where
 new_ring_r = s_ring_recv => {
 if let Some(new_ring) = new_ring_r {
 debug!("({}) Adding ring difference to syncer todo list", self.table.name);
-self.todo.lock().await.add_ring_difference(&self.table, &prev_ring, &new_ring);
+self.todo.lock().unwrap().add_ring_difference(&self.table, &prev_ring, &new_ring);
 prev_ring = new_ring;
 }
 }
@@ -194,7 +196,7 @@ where
 }
 
 pub async fn add_full_scan(&self) {
-self.todo.lock().await.add_full_scan(&self.table);
+self.todo.lock().unwrap().add_full_scan(&self.table);
 }
 
 async fn syncer_task(
@@ -203,7 +205,8 @@ where
 busy_tx: mpsc::UnboundedSender<bool>,
 ) -> Result<(), Error> {
 while !*must_exit.borrow() {
-if let Some(partition) = self.todo.lock().await.pop_task() {
+let task = self.todo.lock().unwrap().pop_task();
+if let Some(partition) = task {
 busy_tx.send(true)?;
 let res = self
 .clone()
@@ -228,76 +231,152 @@ where
 partition: &TodoPartition,
 must_exit: &mut watch::Receiver<bool>,
 ) -> Result<(), Error> {
+if partition.retain {
 let my_id = self.table.system.id;
 let nodes = self
 .table
 .replication
 .write_nodes(&partition.begin, &self.table.system)
 .into_iter()
 .filter(|node| *node != my_id)
 .collect::<Vec<_>>();
 
 debug!(
 "({}) Preparing to sync {:?} with {:?}...",
 self.table.name, partition, nodes
 );
 
-let root_cks = self
-.root_checksum(&partition.begin, &partition.end, must_exit)
-.await?;
+let root_cks = self.root_checksum(&partition.begin, &partition.end, must_exit)?;
 
 let mut sync_futures = nodes
 .iter()
 .map(|node| {
 self.clone().do_sync_with(
 partition.clone(),
 root_cks.clone(),
 *node,
-partition.retain,
 must_exit.clone(),
 )
 })
 .collect::<FuturesUnordered<_>>();
 
 let mut n_errors = 0;
 while let Some(r) = sync_futures.next().await {
 if let Err(e) = r {
 n_errors += 1;
 warn!("({}) Sync error: {}", self.table.name, e);
 }
 }
 if n_errors > self.table.replication.max_write_errors() {
 return Err(Error::Message(format!(
 "Sync failed with too many nodes (should have been: {:?}).",
 nodes
 )));
 }
-
-if !partition.retain {
-self.table
-.delete_range(&partition.begin, &partition.end)
+} else {
+self.offload_partition(&partition.begin, &partition.end, must_exit)
 .await?;
 }
 
 Ok(())
 }
 
-async fn root_checksum(
+// Offload partition: this partition is not something we are storing,
+// so send it out to all other nodes that store it and delete items locally.
+// We don't bother checking if the remote nodes already have the items,
+// we just batch-send everything. Offloading isn't supposed to happen very often.
+// If any of the nodes that are supposed to store the items is unable to
+// save them, we interrupt the process.
+async fn offload_partition(
+self: &Arc<Self>,
+begin: &Hash,
+end: &Hash,
+must_exit: &mut watch::Receiver<bool>,
+) -> Result<(), Error> {
+let mut counter: usize = 0;
+
+while !*must_exit.borrow() {
+let mut items = Vec::new();
+
+for item in self.table.store.range(begin.to_vec()..end.to_vec()) {
+let (key, value) = item?;
+items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
+if items.len() >= 1024 {
+break;
+}
+}
+
+if items.len() > 0 {
+let nodes = self
+.table
+.replication
+.write_nodes(&begin, &self.table.system)
+.into_iter()
+.collect::<Vec<_>>();
+if nodes.contains(&self.table.system.id) {
+warn!("Interrupting offload as partitions seem to have changed");
+break;
+}
+
+counter += 1;
+debug!("Offloading items from {:?}..{:?} ({})", begin, end, counter);
+self.offload_items(&items, &nodes[..]).await?;
+} else {
+break;
+}
+}
+
+Ok(())
+}
+
+async fn offload_items(
+self: &Arc<Self>,
+items: &Vec<(Vec<u8>, Arc<ByteBuf>)>,
+nodes: &[UUID],
+) -> Result<(), Error> {
+let values = items.iter().map(|(_k, v)| v.clone()).collect::<Vec<_>>();
+let update_msg = Arc::new(TableRPC::<F>::Update(values));
+
+for res in join_all(nodes.iter().map(|to| {
+self.table
+.rpc_client
+.call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT)
+}))
+.await
+{
+res?;
+}
+
+// All remote nodes have written those items, now we can delete them locally
+for (k, v) in items.iter() {
+self.table.store.transaction(|tx_db| {
+if let Some(curv) = tx_db.get(k)? {
+if curv == &v[..] {
+tx_db.remove(&k[..])?;
+}
+}
+Ok(())
+})?;
+}
+
+Ok(())
+}
+
+fn root_checksum(
 self: &Arc<Self>,
 begin: &Hash,
 end: &Hash,
 must_exit: &mut watch::Receiver<bool>,
 ) -> Result<RangeChecksum, Error> {
 for i in 1..MAX_DEPTH {
-let rc = self
-.range_checksum(
+let rc = self.range_checksum(
 &SyncRange {
 begin: begin.to_vec(),
 end: end.to_vec(),
 level: i,
 },
 must_exit,
-)
-.await?;
+)?;
 if rc.found_limit.is_none() {
 return Ok(rc);
 }
@@ -307,7 +386,7 @@ where
 )))
 }
 
-async fn range_checksum(
+fn range_checksum(
 self: &Arc<Self>,
 range: &SyncRange,
 must_exit: &mut watch::Receiver<bool>,
@@ -357,9 +436,7 @@ where
 };
 let mut time = Instant::now();
 while !*must_exit.borrow() {
-let sub_ck = self
-.range_checksum_cached_hash(&sub_range, must_exit)
-.await?;
+let sub_ck = self.range_checksum_cached_hash(&sub_range, must_exit)?;
 
 if let Some(hash) = sub_ck.hash {
 children.push((sub_range.clone(), hash));
@@ -397,50 +474,48 @@ where
 }
 }
 
-fn range_checksum_cached_hash<'a>(
-self: &'a Arc<Self>,
-range: &'a SyncRange,
-must_exit: &'a mut watch::Receiver<bool>,
-) -> BoxFuture<'a, Result<RangeChecksumCache, Error>> {
-async move {
-let mut cache = self.cache[range.level].lock().await;
+fn range_checksum_cached_hash(
+self: &Arc<Self>,
+range: &SyncRange,
+must_exit: &mut watch::Receiver<bool>,
+) -> Result<RangeChecksumCache, Error> {
+{
+let mut cache = self.cache[range.level].lock().unwrap();
 if let Some(v) = cache.get(&range) {
 if Instant::now() - v.time < CHECKSUM_CACHE_TIMEOUT {
 return Ok(v.clone());
 }
 }
 cache.remove(&range);
-drop(cache);
+}
 
-let v = self.range_checksum(&range, must_exit).await?;
+let v = self.range_checksum(&range, must_exit)?;
 trace!(
 "({}) New checksum calculated for {}-{}/{}, {} children",
 self.table.name,
 hex::encode(&range.begin)
 .chars()
 .take(16)
 .collect::<String>(),
 hex::encode(&range.end).chars().take(16).collect::<String>(),
 range.level,
 v.children.len()
 );
 
 let hash = if v.children.len() > 0 {
 Some(blake2sum(&rmp_to_vec_all_named(&v)?[..]))
 } else {
 None
 };
 let cache_entry = RangeChecksumCache {
 hash,
 found_limit: v.found_limit,
 time: v.time,
 };
 
-let mut cache = self.cache[range.level].lock().await;
+let mut cache = self.cache[range.level].lock().unwrap();
 cache.insert(range.clone(), cache_entry.clone());
 Ok(cache_entry)
-}
-.boxed()
 }
 
 async fn do_sync_with(
@@ -448,7 +523,6 @@ where
 partition: TodoPartition,
 root_ck: RangeChecksum,
 who: UUID,
-retain: bool,
 mut must_exit: watch::Receiver<bool>,
 ) -> Result<(), Error> {
 let mut todo = VecDeque::new();
@@ -468,7 +542,7 @@ where
 .await?;
 if let TableRPC::<F>::SyncRPC(SyncRPC::RootChecksumRange(range)) = root_cks_resp {
 if range.level > root_ck.bounds.level {
-let their_root_range_ck = self.range_checksum(&range, &mut must_exit).await?;
+let their_root_range_ck = self.range_checksum(&range, &mut must_exit)?;
 todo.push_back(their_root_range_ck);
 } else {
 todo.push_back(root_ck);
@@ -498,7 +572,7 @@ where
 .rpc_client
 .call(
 who,
-TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step, retain)),
+TableRPC::<F>::SyncRPC(SyncRPC::Checksums(step)),
 TABLE_SYNC_RPC_TIMEOUT,
 )
 .await?;
@@ -519,11 +593,11 @@ where
 if differing.level == 0 {
 items_to_send.push(differing.begin);
 } else {
-let checksum = self.range_checksum(&differing, &mut must_exit).await?;
+let checksum = self.range_checksum(&differing, &mut must_exit)?;
 todo.push_back(checksum);
 }
 }
-if retain && diff_items.len() > 0 {
+if diff_items.len() > 0 {
 self.table.handle_update(&diff_items[..]).await?;
 }
 if items_to_send.len() > 0 {
@@ -575,11 +649,11 @@ where
 ) -> Result<SyncRPC, Error> {
 match message {
 SyncRPC::GetRootChecksumRange(begin, end) => {
-let root_cks = self.root_checksum(&begin, &end, &mut must_exit).await?;
+let root_cks = self.root_checksum(&begin, &end, &mut must_exit)?;
 Ok(SyncRPC::RootChecksumRange(root_cks.bounds))
 }
-SyncRPC::Checksums(checksums, retain) => {
-self.handle_checksums_rpc(&checksums[..], *retain, &mut must_exit)
+SyncRPC::Checksums(checksums) => {
+self.handle_checksums_rpc(&checksums[..], &mut must_exit)
 .await
 }
 _ => Err(Error::Message(format!("Unexpected sync RPC"))),
@@ -589,14 +663,13 @@ where
 async fn handle_checksums_rpc(
 self: &Arc<Self>,
 checksums: &[RangeChecksum],
-retain: bool,
 must_exit: &mut watch::Receiver<bool>,
 ) -> Result<SyncRPC, Error> {
 let mut ret_ranges = vec![];
 let mut ret_items = vec![];
 
 for their_ckr in checksums.iter() {
-let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit).await?;
+let our_ckr = self.range_checksum(&their_ckr.bounds, must_exit)?;
 for (their_range, their_hash) in their_ckr.children.iter() {
 let differs = match our_ckr
 .children
@ -604,9 +677,8 @@ where
{ {
Err(_) => { Err(_) => {
if their_range.level >= 1 { if their_range.level >= 1 {
let cached_hash = self let cached_hash =
.range_checksum_cached_hash(&their_range, must_exit) self.range_checksum_cached_hash(&their_range, must_exit)?;
.await?;
cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true) cached_hash.hash.map(|h| h != *their_hash).unwrap_or(true)
} else { } else {
true true
@@ -616,7 +688,7 @@ where
 };
 if differs {
 ret_ranges.push(their_range.clone());
-if retain && their_range.level == 0 {
+if their_range.level == 0 {
 if let Some(item_bytes) =
 self.table.store.get(their_range.begin.as_slice())?
 {
@@ -640,7 +712,7 @@ where
 if our_range.level > 0 {
 ret_ranges.push(our_range.clone());
 }
-if retain && our_range.level == 0 {
+if our_range.level == 0 {
 if let Some(item_bytes) =
 self.table.store.get(our_range.begin.as_slice())?
 {
@@ -673,7 +745,7 @@ where
 end: vec![],
 level: i,
 };
-let mut cache = self.cache[i].lock().await;
+let mut cache = self.cache[i].lock().unwrap();
 if let Some(cache_entry) = cache.range(..=needle).rev().next() {
 if cache_entry.0.begin <= item_key && cache_entry.0.end > item_key {
 let index = cache_entry.0.clone();