[pbs-devel] [PATCH proxmox-backup 4/9] fix #5982: garbage collection: check atime updates are honored

Christian Ebner c.ebner at proxmox.com
Mon Mar 3 16:23:14 CET 2025


On 3/3/25 14:51, Fabian Grünbichler wrote:
> On February 19, 2025 5:48 pm, Christian Ebner wrote:
>> Check if the filesystem backing the chunk store actually updates the
>> atime to avoid potential data loss in phase 2 of garbage collection
>> in case the atime update is not honored.
>>
>> Perform the check before phase 1 of garbage collection, as well as
>> on datastore creation. The latter to detect early and disallow
>> datastore creation on filesystem configurations which would otherwise
>> most likely lead to data loss.
>>
>> Enable the atime update check by default, but allow opting out by
>> setting a datastore tuning parameter flag for backwards compatibility.
>> This is honored by both garbage collection and datastore creation.
>>
>> The check uses a 4 MiB fixed-size, unencrypted and compressed chunk
>> as test marker, inserted if not present. Such an all-zero chunk is
>> very likely present anyway for unencrypted backup contents with large
>> all-zero regions using fixed-size chunking (e.g. VMs).
>>
>> The check sets the chunk's atime twice: once to 1 second into the
>> past, and once to now. By this one can detect whether the atime is
>> set at all, and whether it is also set if the filesystem is e.g.
>> mounted with relatime, which is intended to defer the atime update.
> 
> that's not how relatime works? see below..
> 
>>
>> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=5982
>> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
>> ---
>> changes since version 1:
>> - Use all-zero fixed size chunk for atime update test
>> - Test on datastore creation and before garbage collection phase 1
>> - fail datastore creation and garbage collection jobs if check fails,
>>    but keep opt-out
>>
>>   pbs-datastore/src/chunk_store.rs | 80 ++++++++++++++++++++++++++++++--
>>   pbs-datastore/src/datastore.rs   | 10 ++++
>>   src/api2/config/datastore.rs     |  1 +
>>   3 files changed, 86 insertions(+), 5 deletions(-)
>>
>> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
>> index 7bdcb0297..be57e3c87 100644
>> --- a/pbs-datastore/src/chunk_store.rs
>> +++ b/pbs-datastore/src/chunk_store.rs
>> @@ -1,6 +1,7 @@
>>   use std::os::unix::io::AsRawFd;
>>   use std::path::{Path, PathBuf};
>>   use std::sync::{Arc, Mutex};
>> +use std::time::{Duration, SystemTime, UNIX_EPOCH};
>>   
>>   use anyhow::{bail, format_err, Error};
>>   use tracing::info;
>> @@ -13,6 +14,7 @@ use proxmox_sys::process_locker::{
>>   };
>>   use proxmox_worker_task::WorkerTaskContext;
>>   
>> +use crate::data_blob::DataChunkBuilder;
>>   use crate::file_formats::{
>>       COMPRESSED_BLOB_MAGIC_1_0, ENCRYPTED_BLOB_MAGIC_1_0, UNCOMPRESSED_BLOB_MAGIC_1_0,
>>   };
>> @@ -96,6 +98,7 @@ impl ChunkStore {
>>           uid: nix::unistd::Uid,
>>           gid: nix::unistd::Gid,
>>           sync_level: DatastoreFSyncLevel,
>> +        atime_update_check: bool,
>>       ) -> Result<Self, Error>
>>       where
>>           P: Into<PathBuf>,
>> @@ -150,7 +153,16 @@ impl ChunkStore {
>>               }
>>           }
>>   
>> -        Self::open(name, base, sync_level)
>> +        let chunk_store = Self::open(name, base, sync_level)?;
>> +        if atime_update_check {
>> +            chunk_store
>> +                .check_fs_atime_update()
>> +                .map(|()| info!("atime update check successful."))?;
>> +        } else {
>> +            info!("atime update check skipped.");
>> +        }
>> +
>> +        Ok(chunk_store)
>>       }
>>   
>>       fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
>> @@ -445,6 +457,51 @@ impl ChunkStore {
>>           Ok(())
>>       }
>>   
>> +    /// Check if atime updates are honored by the filesystem backing the chunk store.
>> +    ///
>> +    /// Sets the atime twice, once into the past by one second, checking that this was honored and
>> +    /// then to now again, as otherwise it is not guaranteed to also cover time ranges by `relatime`.
>> +    /// Uses a 4 MiB fixed size, compressed but unencrypted chunk to test. The chunk is inserted in
>> +    /// the chunk store if not yet present.
>> +    pub fn check_fs_atime_update(&self) -> Result<(), Error> {
>> +        let (zero_chunk, digest) = DataChunkBuilder::build_zero_chunk(None, 4096 * 1024, true)?;
>> +        self.insert_chunk(&zero_chunk, &digest)?;
>> +
>> +        let past = SystemTime::now().duration_since(UNIX_EPOCH)? - Duration::from_secs(1);
>> +        let times: [libc::timespec; 2] = [
>> +            libc::timespec {
>> +                tv_sec: past.as_secs().try_into()?,
>> +                tv_nsec: past.subsec_nanos().into(),
>> +            },
>> +            libc::timespec {
>> +                tv_sec: 0,
>> +                tv_nsec: UTIME_OMIT,
>> +            },
>> +        ];
> 
> this sets *atime* one second in the past, while not touching *mtime*.
> this is not something we ever need? if this fails, something is weird,
> but it wouldn't mean that atime handling was broken for our use case..
> 
>> +
>> +        use nix::NixPath;
>> +        let (path, _digest) = self.chunk_path(&digest);
>> +        path.with_nix_path(|cstr| unsafe {
>> +            let tmp = libc::utimensat(-1, cstr.as_ptr(), &times[0], libc::AT_SYMLINK_NOFOLLOW);
>> +            nix::errno::Errno::result(tmp)
>> +        })??;
>> +
>> +        let metadata_past = std::fs::metadata(&path).map_err(Error::from)?;
>> +        let atime_past = metadata_past.accessed()?;
>> +        if past != atime_past.duration_since(UNIX_EPOCH)? {
>> +            bail!("setting atime to the past failed, is atime support enabled on datastore backing filesystem?");
>> +        }
>> +
>> +        self.cond_touch_chunk(&digest, true)?;
>> +
>> +        let metadata_now = std::fs::metadata(&path).map_err(Error::from)?;
>> +        let atime_now = metadata_now.accessed()?;
>> +        if atime_past >= atime_now {
>> +            bail!("atime update check failed, is atime support enabled on datastore backing filesystem?");
>> +        }
> 
> if something implements explicit timestamp updates to work like relatime
> does for read-triggered updates, this would still potentially fail -
> relatime covers checking *mtime* (which we haven't updated above!) first
> to see if *atime* should be updated when doing reads. it's basically a
> performance optimization (skip most atime updates for read-heavy
> workloads) without breaking a very specific use case (that of mutt abusing
> atime to know when it last read a local mailbox[0], which would break if
> atime were never updated on reads).

Thanks a lot for the pointer! I need to dig a bit deeper into how
relatime works, as my understanding of it is clearly flawed.
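
For reference, after a first look at the kernel's atime handling in
fs/inode.c, the relatime condition seems to boil down to something like
the following (a simplified sketch in Rust, the function name is mine):

    use std::time::{Duration, SystemTime};

    // Simplified model of the kernel's relatime handling: on access, the
    // atime is only written if the file was modified or changed since the
    // last read, or if the current atime is older than 24 hours.
    fn relatime_needs_update(atime: SystemTime, mtime: SystemTime, ctime: SystemTime) -> bool {
        let one_day = Duration::from_secs(24 * 60 * 60);
        mtime >= atime
            || ctime >= atime
            || SystemTime::now()
                .duration_since(atime)
                .map_or(false, |age| age >= one_day)
    }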

> I think what we actually want to do at GC time is:
> 
> (potentially) insert the chunk
> stat the chunk
> touch the chunk (regularly)
> stat the chunk again
> compare the timestamps

That was my initial approach, which failed because of the timestamp 
granularity: since I didn't want to wait in-between the stat and the 
touch, the approach of first setting the atime into the past came to 
mind, hence my implementation. But based on your feedback I see that 
this does not work.

> we now have three cases:
> 
> A) the old atime is before the new atime => the storage doesn't discard
> *all* atime updates and we can proceed
> B) the old atime and the new atime are identical and atime is *before*
> mtime and ctime => atime should have been updated even under
> "relatime-like" semantics, very likely this storage is discarding all
> atime updates, we should abort with an error
> C) the old atime and the new atime are identical, but atime is *after*
> ctime/mtime => we need to check whether we just ran into relatime-like
> behaviour
> 
> C means we either abort as well (if we don't want to support such
> setups), or we try to detect if setting both mtime and atime works, and
> force the GC into doing that
> 
> we might want to add a "wait a second" between the first stat and the
> touching to account for filesystems with coarser timestamp granularity
> (inserting touches, and other tasks might touch at "the same time" as
> well by accident), since the requested timestamps might be modified to
> become "the greatest value supported by the filesystem that is not
> greater than the specified time" (utimensat(2)), and one second doesn't
> really hurt in the scope of a GC..
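
If I follow correctly, the check before phase 1 would then look roughly
like this (just a sketch, reusing the helpers and imports already used
in the patch above; the ctime comparison for case B is omitted for
brevity):

    // Sketch of the suggested check: stat, touch regularly, stat again,
    // then compare the timestamps according to the cases above.
    pub fn check_fs_atime_updates(&self) -> Result<(), Error> {
        let (zero_chunk, digest) = DataChunkBuilder::build_zero_chunk(None, 4096 * 1024, true)?;
        self.insert_chunk(&zero_chunk, &digest)?;
        let (path, _digest) = self.chunk_path(&digest);

        let before = std::fs::metadata(&path)?;
        let old_atime = before.accessed()?;

        // account for filesystems with coarse (e.g. full second)
        // timestamp granularity, as suggested
        std::thread::sleep(Duration::from_secs(1));

        // regular touch, exactly as done by garbage collection phase 1
        self.cond_touch_chunk(&digest, true)?;

        let new_atime = std::fs::metadata(&path)?.accessed()?;

        if old_atime < new_atime {
            return Ok(()); // case A: atime updates are honored
        }
        if old_atime <= before.modified()? {
            // case B: even relatime-like semantics should have updated atime
            bail!("atime updates are not honored by the datastore backing filesystem");
        }
        // case C: relatime-like behaviour cannot be told apart from
        // discarded atime updates without further checks
        bail!("unable to verify atime updates on the datastore backing filesystem");
    }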
> 
> note that the 24h grace period is totally independent of this - it comes
> from lazytime and relates to how often changes of timestamps are
> *persisted on disk*. that shouldn't affect us for local storages (since
> we talk to the kernel and always get the updated view anyway, no matter
> if it is in-memory or on-disk; if the PBS system crashes, GC is gone as
> well and needs to start over, so there is no risk of inconsistency).
> 
> we might have a problem if the storage uses lazytime internally and
> lives somewhere else
> - GC starts phase1
> - updates a few atimes
> - storage crashes and loses some of the updates while I/O to it blocks
> - storage comes back up
> - GC continues not knowing that some of the atime updates have been lost
>    in the ether
> - GC enters phase2
> - chunk timestamp makes it eligible for deletion, even though it was
>    touched in phase1
> - data loss
> 
> I don't think we can really prevent this other than by ditching the
> whole atime mechanism and instead keeping a list of written chunks in
> each backup writer, generating such a list in phase1, then merging them
> all and using that for phase2 (which is really expensive, as it scales
> with the number of touched chunks and would require syncing up writers
> and GC).
> thankfully I'd expect most such failure scenarios to actually abort the
> GC anyway for I/O error reasons..

Maybe one could use `statmount` to determine if the `SB_LAZYTIME` flag 
is set in `smbuf.sb_flags`? But I assume you meant the issue being that 
this is not exposed by the filesystem...
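
Short of the rather new statmount syscall, scanning /proc/self/mounts
for the lazytime option would work at least for local mounts, along
these lines (only a sketch; expects an absolute path, and whatever a
remote storage does internally stays invisible either way):

    use std::path::Path;

    // Sketch: check whether the mount backing `path` was mounted with
    // the lazytime option, by picking the longest mount point from
    // /proc/self/mounts that is a prefix of the given (absolute) path.
    fn mount_uses_lazytime(path: &Path) -> std::io::Result<bool> {
        let mounts = std::fs::read_to_string("/proc/self/mounts")?;
        let mut best: Option<(usize, bool)> = None;
        for line in mounts.lines() {
            let mut fields = line.split_whitespace();
            let (Some(_dev), Some(mnt), Some(_fstype), Some(opts)) =
                (fields.next(), fields.next(), fields.next(), fields.next())
            else {
                continue;
            };
            if path.starts_with(mnt) && best.map_or(true, |(len, _)| mnt.len() > len) {
                best = Some((mnt.len(), opts.split(',').any(|o| o == "lazytime")));
            }
        }
        Ok(best.map_or(false, |(_, lazytime)| lazytime))
    }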

> at datastore creation we could do more in-depth checks (including
> "faking" past timestamps and seeing if updates are honored for those if
> they are not honored when doing the simple tests) - if we think we gain
> something from that?
> 
> 0: http://www.mutt.org/doc/manual/#new-mail-formats
> 
>> +        Ok(())
>> +    }
>> +
>>       pub fn insert_chunk(&self, chunk: &DataBlob, digest: &[u8; 32]) -> Result<(bool, u64), Error> {
>>           // unwrap: only `None` in unit tests
>>           assert!(self.locker.is_some());
>> @@ -631,8 +688,15 @@ fn test_chunk_store1() {
>>       let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
>>           .unwrap()
>>           .unwrap();
>> -    let chunk_store =
>> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None).unwrap();
>> +    let chunk_store = ChunkStore::create(
>> +        "test",
>> +        &path,
>> +        user.uid,
>> +        user.gid,
>> +        DatastoreFSyncLevel::None,
>> +        true,
>> +    )
>> +    .unwrap();
>>   
>>       let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
>>           .build()
>> @@ -644,8 +708,14 @@ fn test_chunk_store1() {
>>       let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
>>       assert!(exists);
>>   
>> -    let chunk_store =
>> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None);
>> +    let chunk_store = ChunkStore::create(
>> +        "test",
>> +        &path,
>> +        user.uid,
>> +        user.gid,
>> +        DatastoreFSyncLevel::None,
>> +        true,
>> +    );
>>       assert!(chunk_store.is_err());
>>   
>>       if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
>> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
>> index 75c0c16ab..9b01790cc 100644
>> --- a/pbs-datastore/src/datastore.rs
>> +++ b/pbs-datastore/src/datastore.rs
>> @@ -1170,6 +1170,16 @@ impl DataStore {
>>                   upid: Some(upid.to_string()),
>>                   ..Default::default()
>>               };
>> +            let tuning: DatastoreTuning = serde_json::from_value(
>> +                DatastoreTuning::API_SCHEMA
>> +                    .parse_property_string(gc_store_config.tuning.as_deref().unwrap_or(""))?,
>> +            )?;
>> +            if tuning.gc_atime_check.unwrap_or(true) {
>> +                self.inner.chunk_store.check_fs_atime_update()?;
>> +                info!("Filesystem atime update check successful.");
>> +            } else {
>> +                info!("Filesystem atime update check disabled by datastore tuning options.");
>> +            }
>>   
>>               info!("Start GC phase1 (mark used chunks)");
>>   
>> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
>> index fe3260f6d..1cd38f2db 100644
>> --- a/src/api2/config/datastore.rs
>> +++ b/src/api2/config/datastore.rs
>> @@ -119,6 +119,7 @@ pub(crate) fn do_create_datastore(
>>                   backup_user.uid,
>>                   backup_user.gid,
>>                   tuning.sync_level.unwrap_or_default(),
>> +                tuning.gc_atime_check.unwrap_or(true),
>>               )
>>               .map(|_| ())
>>           } else {
>> -- 
>> 2.39.5