Make fsync an option for meta and data
This commit is contained in:
parent
1e466b11eb
commit
e7e164a280
@ -10,6 +10,8 @@ Here is an example `garage.toml` configuration file that illustrates all of the
|
||||
```toml
|
||||
metadata_dir = "/var/lib/garage/meta"
|
||||
data_dir = "/var/lib/garage/data"
|
||||
metadata_fsync = true
|
||||
data_fsync = false
|
||||
|
||||
db_engine = "lmdb"
|
||||
|
||||
@ -124,6 +126,49 @@ convert-db -a <input db engine> -i <input db path> \
|
||||
Make sure to specify the full database path as presented in the table above,
|
||||
and not just the path to the metadata directory.
|
||||
|
||||
### `metadata_fsync`
|
||||
|
||||
Whether to enable synchronous mode for the database engine or not.
|
||||
This is disabled (`false`) by default.
|
||||
|
||||
Enabling this option reduces the risk of metadata corruption in case of power failures,
|
||||
at the cost of a significant drop in write performance,
|
||||
as Garage will have to pause to sync data to disk much more often
|
||||
(several times for API calls such as PutObject).
|
||||
|
||||
Using this option reduces the risk of simultaneous metadata corruption on several
|
||||
cluster nodes, which could lead to data loss.
|
||||
|
||||
If multi-site replication is used, this option is most likely not necessary, as
|
||||
it is extremely unlikely that two nodes in different locations will have a
|
||||
power failure at the exact same time.
|
||||
|
||||
(Metadata corruption on a single node is not an issue: the corrupted data file
|
||||
can always be deleted and reconstructed from the other nodes in the cluster.)
|
||||
|
||||
Here is how this option impacts the different database engines:
|
||||
|
||||
| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` |
|
||||
|----------|------------------------------------|-------------------------------|
|
||||
| Sled | default options | *unsupported* |
|
||||
| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
|
||||
| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
|
||||
|
||||
Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`).
|
||||
|
||||
### `data_fsync`
|
||||
|
||||
Whether to `fsync` data blocks and their containing directory after they are
|
||||
saved to disk.
|
||||
This is disabled (`false`) by default.
|
||||
|
||||
This might reduce the risk that a data block is lost in rare
|
||||
situations such as several nodes losing power simultaneously,
|
||||
at the cost of a moderate drop in write performance.
|
||||
|
||||
Similarly to `metadata_fsync`, this is likely not necessary
|
||||
if geographical replication is used.
|
||||
|
||||
### `block_size`
|
||||
|
||||
Garage splits stored objects into consecutive chunks of size `block_size`
|
||||
|
@ -80,6 +80,7 @@ pub struct BlockManager {
|
||||
/// Directory in which block are stored
|
||||
pub data_dir: PathBuf,
|
||||
|
||||
data_fsync: bool,
|
||||
compression_level: Option<i32>,
|
||||
|
||||
mutation_lock: [Mutex<BlockManagerLocked>; 256],
|
||||
@ -114,6 +115,7 @@ impl BlockManager {
|
||||
pub fn new(
|
||||
db: &db::Db,
|
||||
data_dir: PathBuf,
|
||||
data_fsync: bool,
|
||||
compression_level: Option<i32>,
|
||||
replication: TableShardedReplication,
|
||||
system: Arc<System>,
|
||||
@ -141,6 +143,7 @@ impl BlockManager {
|
||||
let block_manager = Arc::new(Self {
|
||||
replication,
|
||||
data_dir,
|
||||
data_fsync,
|
||||
compression_level,
|
||||
mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
|
||||
rc,
|
||||
@ -713,7 +716,11 @@ impl BlockManagerLocked {
|
||||
|
||||
let mut f = fs::File::create(&path_tmp).await?;
|
||||
f.write_all(data).await?;
|
||||
|
||||
if mgr.data_fsync {
|
||||
f.sync_all().await?;
|
||||
}
|
||||
|
||||
drop(f);
|
||||
|
||||
fs::rename(path_tmp, path).await?;
|
||||
@ -724,6 +731,7 @@ impl BlockManagerLocked {
|
||||
fs::remove_file(to_delete).await?;
|
||||
}
|
||||
|
||||
if mgr.data_fsync {
|
||||
// We want to ensure that when this function returns, data is properly persisted
|
||||
// to disk. The first step is the sync_all above that does an fsync on the data file.
|
||||
// Now, we do an fsync on the containing directory, to ensure that the rename
|
||||
@ -736,6 +744,7 @@ impl BlockManagerLocked {
|
||||
.await?;
|
||||
dir.sync_all().await?;
|
||||
drop(dir);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -91,6 +91,11 @@ impl Garage {
|
||||
// ---- Sled DB ----
|
||||
#[cfg(feature = "sled")]
|
||||
"sled" => {
|
||||
if config.metadata_fsync {
|
||||
return Err(Error::Message(format!(
|
||||
"`metadata_fsync = true` is not supported with the Sled database engine"
|
||||
)));
|
||||
}
|
||||
db_path.push("db");
|
||||
info!("Opening Sled database at: {}", db_path.display());
|
||||
let db = db::sled_adapter::sled::Config::default()
|
||||
@ -111,7 +116,11 @@ impl Garage {
|
||||
let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
|
||||
.and_then(|db| {
|
||||
db.pragma_update(None, "journal_mode", &"WAL")?;
|
||||
if config.metadata_fsync {
|
||||
db.pragma_update(None, "synchronous", &"NORMAL")?;
|
||||
} else {
|
||||
db.pragma_update(None, "synchronous", &"OFF")?;
|
||||
}
|
||||
Ok(db)
|
||||
})
|
||||
.ok_or_message("Unable to open sqlite DB")?;
|
||||
@ -139,6 +148,9 @@ impl Garage {
|
||||
env_builder.map_size(map_size);
|
||||
unsafe {
|
||||
env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
|
||||
if !config.metadata_fsync {
|
||||
env_builder.flag(heed::flags::Flags::MdbNoSync);
|
||||
}
|
||||
}
|
||||
let db = match env_builder.open(&db_path) {
|
||||
Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
|
||||
@ -208,6 +220,7 @@ impl Garage {
|
||||
let block_manager = BlockManager::new(
|
||||
&db,
|
||||
config.data_dir.clone(),
|
||||
config.data_fsync,
|
||||
config.compression_level,
|
||||
data_rep_param,
|
||||
system.clone(),
|
||||
|
@ -15,6 +15,13 @@ pub struct Config {
|
||||
/// Path where to store data. Can be slower, but need higher volume
|
||||
pub data_dir: PathBuf,
|
||||
|
||||
/// Whether to fsync after all metadata transactions (disabled by default)
|
||||
#[serde(default)]
|
||||
pub metadata_fsync: bool,
|
||||
/// Whether to fsync after all data block writes (disabled by default)
|
||||
#[serde(default)]
|
||||
pub data_fsync: bool,
|
||||
|
||||
/// Size of data blocks to save to disk
|
||||
#[serde(default = "default_block_size")]
|
||||
pub block_size: usize,
|
||||
|
Loading…
Reference in New Issue
Block a user