[pbs-devel] [PATCH proxmox-backup 4/9] fix #5982: garbage collection: check atime updates are honored
Fabian Grünbichler
f.gruenbichler at proxmox.com
Mon Mar 3 14:51:24 CET 2025
On February 19, 2025 5:48 pm, Christian Ebner wrote:
> Check if the filesystem backing the chunk store actually updates the
> atime to avoid potential data loss in phase 2 of garbage collection
> in case the atime update is not honored.
>
> Perform the check before phase 1 of garbage collection, as well as
> on datastore creation. The latter to detect early and disallow
> datastore creation on filesystem configurations which would
> otherwise most likely lead to data loss.
>
> Enable the atime update check by default, but allow opting out by
> setting a datastore tuning parameter flag for backwards
> compatibility. This is honored by both garbage collection and
> datastore creation.
>
> The check uses a 4 MiB fixed-size, unencrypted and compressed chunk
> as test marker, inserted if not present. This all-zero chunk is
> likely to be present anyway for unencrypted backup contents with
> large all-zero regions using fixed-size chunking (e.g. VMs).
>
> The check sets the chunk's atime twice, once to 1 second in the
> past and once back to now. This way one can detect whether the atime
> is set at all, and whether it is also set if the filesystem is e.g.
> mounted via relatime, which is intended to defer the atime update.
that's not how relatime works? see below..
>
> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=5982
> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
> ---
> changes since version 1:
> - Use all-zero fixed size chunk for atime update test
> - Test on datastore creation and before garbage collection phase 1
> - fail datastore creation and garbage collection jobs if check fails,
> but keep opt-out
>
> pbs-datastore/src/chunk_store.rs | 80 ++++++++++++++++++++++++++++++--
> pbs-datastore/src/datastore.rs | 10 ++++
> src/api2/config/datastore.rs | 1 +
> 3 files changed, 86 insertions(+), 5 deletions(-)
>
> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
> index 7bdcb0297..be57e3c87 100644
> --- a/pbs-datastore/src/chunk_store.rs
> +++ b/pbs-datastore/src/chunk_store.rs
> @@ -1,6 +1,7 @@
> use std::os::unix::io::AsRawFd;
> use std::path::{Path, PathBuf};
> use std::sync::{Arc, Mutex};
> +use std::time::{Duration, SystemTime, UNIX_EPOCH};
>
> use anyhow::{bail, format_err, Error};
> use tracing::info;
> @@ -13,6 +14,7 @@ use proxmox_sys::process_locker::{
> };
> use proxmox_worker_task::WorkerTaskContext;
>
> +use crate::data_blob::DataChunkBuilder;
> use crate::file_formats::{
> COMPRESSED_BLOB_MAGIC_1_0, ENCRYPTED_BLOB_MAGIC_1_0, UNCOMPRESSED_BLOB_MAGIC_1_0,
> };
> @@ -96,6 +98,7 @@ impl ChunkStore {
> uid: nix::unistd::Uid,
> gid: nix::unistd::Gid,
> sync_level: DatastoreFSyncLevel,
> + atime_update_check: bool,
> ) -> Result<Self, Error>
> where
> P: Into<PathBuf>,
> @@ -150,7 +153,16 @@ impl ChunkStore {
> }
> }
>
> - Self::open(name, base, sync_level)
> + let chunk_store = Self::open(name, base, sync_level)?;
> + if atime_update_check {
> + chunk_store
> + .check_fs_atime_update()
> + .map(|()| info!("atime update check successful."))?;
> + } else {
> + info!("atime update check skipped.");
> + }
> +
> + Ok(chunk_store)
> }
>
> fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
> @@ -445,6 +457,51 @@ impl ChunkStore {
> Ok(())
> }
>
> + /// Check if atime updates are honored by the filesystem backing the chunk store.
> + ///
> + /// Sets the atime twice, once into the past by one second, checking that this was honored and
> + /// then to now again, as otherwise it is not guaranteed to also cover time ranges by `relatime`.
> + /// Uses a 4 MiB fixed size, compressed but unencrypted chunk to test. The chunk is inserted in
> + /// the chunk store if not yet present.
> + pub fn check_fs_atime_update(&self) -> Result<(), Error> {
> + let (zero_chunk, digest) = DataChunkBuilder::build_zero_chunk(None, 4096 * 1024, true)?;
> + self.insert_chunk(&zero_chunk, &digest)?;
> +
> + let past = SystemTime::now().duration_since(UNIX_EPOCH)? - Duration::from_secs(1);
> + let times: [libc::timespec; 2] = [
> + libc::timespec {
> + tv_sec: past.as_secs().try_into()?,
> + tv_nsec: past.subsec_nanos().into(),
> + },
> + libc::timespec {
> + tv_sec: 0,
> + tv_nsec: UTIME_OMIT,
> + },
> + ];
this sets *atime* one second in the past, while not touching *mtime*.
that is not something we ever need in practice? if this fails,
something is weird, but it wouldn't mean that atime handling was
broken for our use case..
> +
> + use nix::NixPath;
> + let (path, _digest) = self.chunk_path(&digest);
> + path.with_nix_path(|cstr| unsafe {
> + let tmp = libc::utimensat(-1, cstr.as_ptr(), ×[0], libc::AT_SYMLINK_NOFOLLOW);
> + nix::errno::Errno::result(tmp)
> + })??;
> +
> + let metadata_past = std::fs::metadata(&path).map_err(Error::from)?;
> + let atime_past = metadata_past.accessed()?;
> + if past != atime_past.duration_since(UNIX_EPOCH)? {
> + bail!("setting atime to the past failed, is atime support enabled on datastore backing filesystem?");
> + }
> +
> + self.cond_touch_chunk(&digest, true)?;
> +
> + let metadata_now = std::fs::metadata(&path).map_err(Error::from)?;
> + let atime_now = metadata_now.accessed()?;
> + if atime_past >= atime_now {
> + bail!("atime update check failed, is atime support enabled on datastore backing filesystem?");
> + }
if something implements explicit timestamp updates to work like
relatime does for read-triggered updates, this would still potentially
fail - relatime first checks *mtime* (which we haven't updated above!)
to decide whether *atime* should be updated when doing reads. it's
basically a performance optimization (skip most atime updates for
read-heavy workloads) without breaking one very specific use case:
mutt abusing atime to know when it last read a local mailbox[0], which
would break if atime were never updated on reads.
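
for reference, the kernel's relatime policy boils down to roughly this
(simplified sketch of relatime_need_update() in fs/inode.c, not PBS
code):

    // update atime on a read only if it lags behind mtime/ctime, or
    // if the current value is older than a day
    fn relatime_needs_atime_update(atime: i64, mtime: i64, ctime: i64, now: i64) -> bool {
        atime <= mtime || atime <= ctime || now - atime >= 24 * 60 * 60
    }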
I think what we actually want to do at GC time is:
- (potentially) insert the chunk
- stat the chunk
- touch the chunk (regularly)
- stat the chunk again
- compare the timestamps
we now have three cases:
A) the old atime is before the new atime => the storage doesn't discard
*all* atime updates and we can proceed
B) the old atime and the new atime are identical and atime is *before*
mtime and ctime => atime should have been updated even under
"relatime-like" semantics, very likely this storage is discarding all
atime updates, we should abort with an error
C) the old atime and the new atime are identical, but atime is *after*
ctime/mtime => we need to check whether we just ran into relatime-like
behaviour
C means we either abort as well (if we don't want to support such
setups), or we try to detect whether setting both mtime and atime
works, and force GC into doing that.
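
roughly like this (chunk_path() and cond_touch_chunk() as in the patch
above, everything else just made up for illustration, not a finished
implementation):

    use std::os::linux::fs::MetadataExt; // st_atime()/st_mtime()/st_ctime()

    use anyhow::{bail, Error};

    fn gc_atime_check(store: &ChunkStore, digest: &[u8; 32]) -> Result<(), Error> {
        let (path, _) = store.chunk_path(digest);

        let before = std::fs::metadata(&path)?;
        // wait a second so coarse filesystem timestamps can't make the
        // old and new atime collide by accident (see below)
        std::thread::sleep(std::time::Duration::from_secs(1));
        // regular touch, exactly like GC phase1 does it
        store.cond_touch_chunk(digest, true)?;

        let after = std::fs::metadata(&path)?;
        if before.st_atime() < after.st_atime() {
            // case A: atime updates are (at least partially) honored
            return Ok(());
        }
        if before.st_atime() <= before.st_mtime() || before.st_atime() <= before.st_ctime() {
            // case B: even relatime-like semantics would have updated
            // atime here - very likely all atime updates are discarded
            bail!("storage discards atime updates, aborting GC");
        }
        // case C: atime already ahead of mtime/ctime - possibly just
        // relatime-like behaviour, abort (or retry setting both mtime
        // and atime and force GC into doing the same)
        bail!("could not verify atime updates (relatime-like behaviour?)");
    }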
we might want to add a "wait a second" between the first stat and the
touching to account for filesystems with coarse timestamp granularity
(the insert touches as well, and other tasks might touch at "the same
time" by accident), since the requested timestamps might be truncated
to "the greatest value supported by the filesystem that is not greater
than the specified time" (utimensat(2)), and one second doesn't really
hurt in the scope of a GC..
note that the 24h grace period is totally independent of this - it
comes from lazytime and relates to how often changes of timestamps are
*persisted on disk*. that shouldn't affect us for local storages,
since we talk to the kernel and always get the updated view anyway, no
matter whether it is in-memory or on-disk. if the PBS system crashes,
GC is gone as well and needs to start over, so there is no risk of
inconsistency.
we might have a problem if the storage uses lazytime internally and
lives somewhere else:
- GC starts phase1
- updates a few atimes
- storage crashes and loses some of the updates while I/O to it blocks
- storage comes back up
- GC continues not knowing that some of the atime updates have been lost
in the ether
- GC enters phase2
- chunk timestamp makes it eligible for deletion, even though it was
touched in phase1
- data loss
I don't think we can really prevent this other than by ditching the
whole atime mechanism and instead keeping a list of written chunks in
each backup writer, generating such a list in phase1, then merging
them all and using the result for phase2 (which is really expensive,
as it scales with the number of touched chunks and would require
syncing up writers and GC). thankfully I'd expect most such failure
scenarios to actually abort the GC anyway for I/O error reasons..
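
just to spell out what that alternative would mean, phase2 would boil
down to something like this (all names hypothetical):

    use std::collections::HashSet;

    // phase2 without atime: only chunks in neither the writers' nor
    // phase1's merged "touched" set are eligible for deletion
    fn phase2_sweep_candidates(
        all_chunks: impl Iterator<Item = [u8; 32]>,
        touched: &HashSet<[u8; 32]>,
    ) -> Vec<[u8; 32]> {
        all_chunks.filter(|digest| !touched.contains(digest)).collect()
    }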
at datastore creation we could do more in-depth checks (including
"faking" past timestamps and seeing if updates are honored for those,
in case they are not honored when doing the simple tests) - if we
think we gain something from that?
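
such a check could look roughly like the following (again just a
sketch - chunk_path()/cond_touch_chunk() as in the patch, and using
AT_FDCWD instead of the -1 the patch passes to utimensat):

    use std::os::linux::fs::MetadataExt;
    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    use anyhow::{bail, Error};
    use nix::NixPath;

    fn check_faked_past_timestamps(store: &ChunkStore, digest: &[u8; 32]) -> Result<(), Error> {
        let (path, _) = store.chunk_path(digest);
        let past = SystemTime::now().duration_since(UNIX_EPOCH)? - Duration::from_secs(3600);
        let ts = libc::timespec {
            tv_sec: past.as_secs().try_into()?,
            tv_nsec: past.subsec_nanos().into(),
        };
        // fake *both* atime and mtime one hour in the past, so even
        // relatime-like semantics would have to bump atime afterwards
        let times = [ts, ts];
        path.with_nix_path(|cstr| unsafe {
            let res = libc::utimensat(
                libc::AT_FDCWD,
                cstr.as_ptr(),
                times.as_ptr(),
                libc::AT_SYMLINK_NOFOLLOW,
            );
            nix::errno::Errno::result(res)
        })??;

        store.cond_touch_chunk(digest, true)?;
        if std::fs::metadata(&path)?.st_atime() <= ts.tv_sec {
            bail!("atime update not honored even with atime == mtime set in the past");
        }
        Ok(())
    }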
0: http://www.mutt.org/doc/manual/#new-mail-formats
> + Ok(())
> + }
> +
> pub fn insert_chunk(&self, chunk: &DataBlob, digest: &[u8; 32]) -> Result<(bool, u64), Error> {
> // unwrap: only `None` in unit tests
> assert!(self.locker.is_some());
> @@ -631,8 +688,15 @@ fn test_chunk_store1() {
> let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
> .unwrap()
> .unwrap();
> - let chunk_store =
> - ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None).unwrap();
> + let chunk_store = ChunkStore::create(
> + "test",
> + &path,
> + user.uid,
> + user.gid,
> + DatastoreFSyncLevel::None,
> + true,
> + )
> + .unwrap();
>
> let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
> .build()
> @@ -644,8 +708,14 @@ fn test_chunk_store1() {
> let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
> assert!(exists);
>
> - let chunk_store =
> - ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None);
> + let chunk_store = ChunkStore::create(
> + "test",
> + &path,
> + user.uid,
> + user.gid,
> + DatastoreFSyncLevel::None,
> + true,
> + );
> assert!(chunk_store.is_err());
>
> if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
> index 75c0c16ab..9b01790cc 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -1170,6 +1170,16 @@ impl DataStore {
> upid: Some(upid.to_string()),
> ..Default::default()
> };
> + let tuning: DatastoreTuning = serde_json::from_value(
> + DatastoreTuning::API_SCHEMA
> + .parse_property_string(gc_store_config.tuning.as_deref().unwrap_or(""))?,
> + )?;
> + if tuning.gc_atime_check.unwrap_or(true) {
> + self.inner.chunk_store.check_fs_atime_update()?;
> + info!("Filesystem atime update check successful.");
> + } else {
> + info!("Filesystem atime update check disabled by datastore tuning options.");
> + }
>
> info!("Start GC phase1 (mark used chunks)");
>
> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
> index fe3260f6d..1cd38f2db 100644
> --- a/src/api2/config/datastore.rs
> +++ b/src/api2/config/datastore.rs
> @@ -119,6 +119,7 @@ pub(crate) fn do_create_datastore(
> backup_user.uid,
> backup_user.gid,
> tuning.sync_level.unwrap_or_default(),
> + tuning.gc_atime_check.unwrap_or(true),
> )
> .map(|_| ())
> } else {
> --
> 2.39.5
>