[pbs-devel] [PATCH v3 proxmox-backup 4/9] fix #5982: garbage collection: check atime updates are honored

Christian Ebner c.ebner at proxmox.com
Wed Mar 5 11:31:41 CET 2025


On 3/5/25 10:41, Fabian Grünbichler wrote:
> On March 4, 2025 7:35 pm, Christian Ebner wrote:
>> Check if the filesystem backing the chunk store actually updates the
>> atime to avoid potential data loss in phase 2 of garbage collection,
>> in case the atime update is not honored.
>>
>> Perform the check before phase 1 of garbage collection, as well as
>> on datastore creation. The latter is done to detect such cases early
>> and disallow datastore creation on filesystem configurations which
>> would otherwise most likely lead to data loss.
>>
>> Enable the atime update check by default, but allow opting out by
>> setting a datastore tuning parameter flag for backwards compatibility.
>> This is honored by both garbage collection and datastore creation.
>>
>> The check uses a 4 MiB fixed-size, unencrypted and compressed chunk
>> as test marker, inserted if not present. This all-zero chunk is very
>> likely to exist anyway for unencrypted backup contents with large
>> all-zero regions using fixed-size chunking (e.g. VMs).
>>
>> To avoid cases where the timestamp will not be updated because of the
>> Linux kernel's timestamp granularity, sleep for 1 second between
>> stating the file and calling utimensat.
>>
>> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=5982
>> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
>> ---
>> changes since version 2:
>> - Take the Linux timestamp granularity into account by sleeping 1 second
>>    between operations instead of setting the timestamp to the past.
>> - Check relatime behavior if the atime update is not honored.
>> - Adapt datastore tuning variables to new names.
>>
>>
>>   pbs-datastore/src/chunk_store.rs | 128 +++++++++++++++++++++++++++++--
>>   pbs-datastore/src/datastore.rs   |  10 +++
>>   src/api2/config/datastore.rs     |   1 +
>>   3 files changed, 134 insertions(+), 5 deletions(-)
>>
>> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
>> index 5e02909a1..e529dcc9c 100644
>> --- a/pbs-datastore/src/chunk_store.rs
>> +++ b/pbs-datastore/src/chunk_store.rs
>> @@ -1,6 +1,8 @@
>> +use std::cmp::Ordering;
>>   use std::os::unix::io::AsRawFd;
>>   use std::path::{Path, PathBuf};
>>   use std::sync::{Arc, Mutex};
>> +use std::time::Duration;
>>   
>>   use anyhow::{bail, format_err, Error};
>>   use tracing::info;
>> @@ -13,6 +15,7 @@ use proxmox_sys::process_locker::{
>>   };
>>   use proxmox_worker_task::WorkerTaskContext;
>>   
>> +use crate::data_blob::DataChunkBuilder;
>>   use crate::file_formats::{
>>       COMPRESSED_BLOB_MAGIC_1_0, ENCRYPTED_BLOB_MAGIC_1_0, UNCOMPRESSED_BLOB_MAGIC_1_0,
>>   };
>> @@ -93,6 +96,7 @@ impl ChunkStore {
>>           uid: nix::unistd::Uid,
>>           gid: nix::unistd::Gid,
>>           sync_level: DatastoreFSyncLevel,
>> +        atime_safety_check: bool,
>>       ) -> Result<Self, Error>
>>       where
>>           P: Into<PathBuf>,
>> @@ -147,7 +151,20 @@ impl ChunkStore {
>>               }
>>           }
>>   
>> -        Self::open(name, base, sync_level)
>> +        let chunk_store = Self::open(name, base, sync_level)?;
>> +        if atime_safety_check {
>> +            chunk_store
>> +                .atime_safety_check()
>> +                .map(|atime_updated| if atime_updated {
>> +                    info!("atime safety check successful.")
>> +                } else {
>> +                    info!("atime safety check successful with relatime behaviour.")
>> +                })?;
>> +        } else {
>> +            info!("atime safety check skipped.");
>> +        }
>> +
>> +        Ok(chunk_store)
>>       }
>>   
>>       fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
>> @@ -442,6 +459,94 @@ impl ChunkStore {
>>           Ok(())
>>       }
>>   
>> +    /// Check if atime updates are honored by the filesystem backing the chunk store.
>> +    ///
>> +    /// Checks if the atime is either updated immediately by utimensat or in a relatime manner by
>> +    /// first setting atime and mtime to now, followed by trying to update the atime.
>> +    /// If the atime update is honored, return true; if the relatime update has been honored,
>> +    /// return false. Return an error otherwise.
>> +    /// Uses a 4 MiB fixed size, compressed but unencrypted chunk to test. The chunk is inserted in
>> +    /// the chunk store if not yet present.
>> +    pub fn atime_safety_check(&self) -> Result<bool, Error> {
>> +        let (zero_chunk, digest) = DataChunkBuilder::build_zero_chunk(None, 4096 * 1024, true)?;
>> +        self.insert_chunk(&zero_chunk, &digest)?;
> 
> we might want to remember whether we insert or not (return value here)
> for log output below.

Acked, will include this information as well.
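Something along these lines (rough sketch, assuming insert_chunk() keeps its
current (bool, u64) return value, where the bool signals that the chunk was
already present):

    // remember whether the test chunk was newly inserted or already present,
    // so the log output can reflect that
    let (chunk_existed, _size) = self.insert_chunk(&zero_chunk, &digest)?;
    if chunk_existed {
        info!("using already existing all-zero chunk for the atime safety check");
    } else {
        info!("inserted new all-zero chunk for the atime safety check");
    }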

> 
>> +        let (path, _digest) = self.chunk_path(&digest);
>> +
>> +        let metadata_before = std::fs::metadata(&path).map_err(Error::from)?;
>> +        let atime_before = metadata_before.accessed()?;
>> +
>> +        // Take into account timestamp update granularity in the kernel
>> +        std::thread::sleep(Duration::from_secs(1));
> 
> small nit: if we re-order the stat and sleep above, we have higher
> chances of actually testing our change and not some side-effect of
> concurrent actions (see below).

Ah yes, that's way better, a lot can happen within a second...
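So the reordered sequence would look roughly like this (sketch, keeping the
existing helpers):

    // sleep first, so the kernel timestamp granularity is already accounted
    // for before the reference atime is read
    std::thread::sleep(Duration::from_secs(1));

    let metadata_before = std::fs::metadata(&path)?;
    let atime_before = metadata_before.accessed()?;

    // most likely the second touch of this chunk, the first one having
    // happened via insert_chunk() above
    self.cond_touch_path(&path, true)?;

    let metadata_now = std::fs::metadata(&path)?;
    let atime_now = metadata_now.accessed()?;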

>> +        self.cond_touch_path(&path, true)?;
> 
> it might be worth a comment to note that this is actually most likely
> the second touch (the first one having happened by insert_chunk above).

Okay, will include a comment mentioning this explicitly.

> this means this test actually doesn't just test that atime updating
> works, but that file creation/touching followed by a second touch
> shortly after works, which is much more like what we want for GC :)
> 
>> +
>> +        let metadata_now = std::fs::metadata(&path).map_err(Error::from)?;
>> +        let atime_now = metadata_now.accessed()?;
>> +
>> +        match atime_before.cmp(&atime_now) {
>> +            Ordering::Less => Ok(true),
> 
> there is a small risk of false positives here if some other action on
> the file path caused atime to change (e.g., a backup uploading a
> better-compressed copy of the zero chunk that becomes visible exactly at
> the right moment, or whatever).
> 
> not sure whether we want to care about that, but wanted to mention it
> anyway for completeness' sake. we could improve robustness by checking
> that the inode and birth time is still the same, and retry the check if
> not? it is probably really unlikely to hit in practice ;)

Yes, had this in mind as well yesterday. Another option would be to not 
rely on an actual chunk file for the test, but rather use a different 
file. But I discarded that option mainly because I didn't want to 
pollute the chunk store with other files, and it actually does not fix 
the issue (the file might still be touched unexpectedly by something 
else). So checking the inode and comparing the ctime is a good 
alternative, with little cost. Will incorporate these additional checks, 
roughly as sketched below.
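For the inode part that could look like this (minimal sketch, assuming the
metadata_before/metadata_now values from the surrounding code):

    use std::os::unix::fs::MetadataExt;

    // if the inode changed, some other process replaced the chunk file
    // in-between, so the measured atime is meaningless and the check
    // should be retried (or an error returned after a few attempts)
    if metadata_before.ino() != metadata_now.ino() {
        bail!("chunk file was replaced while checking atime safety, please retry");
    }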

> 
>> +            Ordering::Equal => {
>> +                // Use the previous mtime here, as that is the one the atime
>> +                // from before the update will be compared to
>> +                let mtime_before = metadata_before.modified()?;
>> +                if atime_now < mtime_before {
>> +                    Err(format_err!(
>> +                        "atime safety check failed, is atime support enabled on datastore backing \
>> +                        filesystem?"
>> +                    ))
>> +                } else {
>> +                    self.relatime_safety_check(&path)?;
>> +                    Ok(false)
> 
> this would be very unexpected behaviour (it would probably make sense to
> systematically test *all common* storages that we can easily test to see
> if any exhibit this behaviour?), and if it occurs, how can we be sure
> that it would honor the 24h (since the behaviour of that
> filesystem/storage is already out of spec).
> 
> I think if we want to go down that route, we'd need to switch the
> touching over to also set the mtime to be safe. we know touching in that
> fashion works (since we test it here), we don't know much else about
> it's semantics. or we say such setups are unsupported, and require users
> to explicitly opt into potentially unsafe operations by disabling the
> check.

Acked, I will check if I can find a filesystem which does behave as 
tested, but if none is to be found I would opt for the latter option, 
making this an error and only allowing users to opt out (see the sketch 
below).
If reports come in later, it is still possible to extend these checks 
and adapt GC to use mtime as well, or even go for a totally different 
approach.
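I.e., something like this for the Ordering::Equal arm (sketch only, the
exact error wording and the hint at the tuning option are still open):

    Ordering::Equal => Err(format_err!(
        "atime safety check failed, is atime support enabled on the datastore \
        backing filesystem? (the check can be disabled via the datastore tuning \
        options)"
    )),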

> 
>> +                }
>> +            }
>> +            Ordering::Greater => Err(format_err!(
>> +                "atime safety check failed, unexpected time shift"
>> +            )),
>> +        }
>> +    }
>> +
>> +    fn relatime_safety_check(&self, path: &Path) -> Result<(), Error> {
>> +        // unwrap: only `None` in unit tests
>> +        assert!(self.locker.is_some());
>> +
>> +        // Update atime and mtime to now
>> +        let times: [libc::timespec; 2] = [
>> +            libc::timespec {
>> +                tv_sec: 0,
>> +                tv_nsec: libc::UTIME_NOW,
>> +            },
>> +            libc::timespec {
>> +                tv_sec: 0,
>> +                tv_nsec: libc::UTIME_NOW,
>> +            },
>> +        ];
> 
> this is identical to passing null to utimensat and could be written as
> such ;)

Acked, will adapt this accordingly.
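I.e., roughly (sketch, mirroring the existing with_nix_path pattern):

    use nix::NixPath;
    if let Err(err) = path.with_nix_path(|cstr| unsafe {
        // a null times pointer tells utimensat to set both atime and mtime
        // to the current time
        let res = libc::utimensat(
            -1,
            cstr.as_ptr(),
            std::ptr::null(),
            libc::AT_SYMLINK_NOFOLLOW,
        );
        nix::errno::Errno::result(res)
    })? {
        bail!("updating atime/mtime failed for chunk {path:?} - {err}");
    }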

> 
>> +
>> +        use nix::NixPath;
>> +        if let Err(err) = path.with_nix_path(|cstr| unsafe {
>> +            let tmp = libc::utimensat(-1, cstr.as_ptr(), &times[0], libc::AT_SYMLINK_NOFOLLOW);
>> +            nix::errno::Errno::result(tmp)
>> +        })? {
>> +            bail!("update atime failed for chunk/file {path:?} - {err}");
> 
> nit: error message is wrong

Acked!

> 
>> +        }
>> +
>> +        // Take into account timestamp update granularity in the kernel
>> +        std::thread::sleep(Duration::from_secs(1));
>> +        // Try updating the chunks atime, which should be performed for filesystems
>> +        // mounted with relatime since mtime is equal
> 
> this is a misleading comment. we don't care about normal filesystems
> being mounted relatime.

Ah yes, this is outdated, a leftover from earlier iterations when I did 
not yet fully understand the implications of lazytime and relatime, 
which we cleared up yesterday.

> 
> what we are checking here is whether some weird filesystem/storage uses
> "something like relatime semantics" for explicit timestamp updates. the
> kernel itself doesn't, it resolves the explicit update and sets the file
> attributes via the filesystem (utimensat -> .. -> notify_changed ->
> setattr), which is a totally different code path than relatime handling
> which only applies for implicit, automatic updates when doing *other* file
> accesses (file_accessed -> touch_atime -> atime_needs_update, which is
> also where noatime is handled).
> 
> lazytime is handled in yet another fashion (the inode is marked as
> having dirty timestamps, and that is checked in various places to decide
> whether to sync it out or not, but the inode in memory is always
> correctly updated).
> 
>> +        self.cond_touch_path(&path, true)?;
>> +
>> +        let metadata_now = std::fs::metadata(&path).map_err(Error::from)?;
>> +        let atime_now = metadata_now.accessed()?;
>> +        let mtime_now = metadata_now.modified()?;
>> +        if atime_now <= mtime_now {
> 
> we still want to check that atime_now is later than atime_before (and
> maybe that mtime_now is later than mtime_before), this condition is
> wrong..

I see, will adapt accordingly, thanks!
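The corrected check could look roughly like this (sketch, assuming the
reference timestamps are read right after the explicit utimensat update and
before the sleep and the second cond_touch_path call):

    let metadata_before = std::fs::metadata(path)?;
    let atime_before = metadata_before.accessed()?;
    let mtime_before = metadata_before.modified()?;

    // ... sleep and cond_touch_path as before ...

    let metadata_now = std::fs::metadata(path)?;
    let atime_now = metadata_now.accessed()?;
    let mtime_now = metadata_now.modified()?;

    // both timestamps must have moved forward for the explicit update to
    // count as honored
    if atime_now <= atime_before || mtime_now <= mtime_before {
        bail!(
            "atime safety check failed, is atime support enabled on the \
            datastore backing filesystem?"
        );
    }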

> 
>> +            bail!(
>> +                "atime safety check failed and relatime update failed, is atime support enabled on \
>> +                datastore backing filesystem?"
>> +            )
>> +        }
>> +
>> +        Ok(())
>> +    }
>> +
>>       pub fn insert_chunk(&self, chunk: &DataBlob, digest: &[u8; 32]) -> Result<(bool, u64), Error> {
>>           // unwrap: only `None` in unit tests
>>           assert!(self.locker.is_some());
>> @@ -628,8 +733,15 @@ fn test_chunk_store1() {
>>       let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
>>           .unwrap()
>>           .unwrap();
>> -    let chunk_store =
>> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None).unwrap();
>> +    let chunk_store = ChunkStore::create(
>> +        "test",
>> +        &path,
>> +        user.uid,
>> +        user.gid,
>> +        DatastoreFSyncLevel::None,
>> +        true,
>> +    )
>> +    .unwrap();
>>   
>>       let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
>>           .build()
>> @@ -641,8 +753,14 @@ fn test_chunk_store1() {
>>       let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
>>       assert!(exists);
>>   
>> -    let chunk_store =
>> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None);
>> +    let chunk_store = ChunkStore::create(
>> +        "test",
>> +        &path,
>> +        user.uid,
>> +        user.gid,
>> +        DatastoreFSyncLevel::None,
>> +        true,
>> +    );
>>       assert!(chunk_store.is_err());
>>   
>>       if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
>> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
>> index 75c0c16ab..ef932b47b 100644
>> --- a/pbs-datastore/src/datastore.rs
>> +++ b/pbs-datastore/src/datastore.rs
>> @@ -1170,6 +1170,16 @@ impl DataStore {
>>                   upid: Some(upid.to_string()),
>>                   ..Default::default()
>>               };
>> +            let tuning: DatastoreTuning = serde_json::from_value(
>> +                DatastoreTuning::API_SCHEMA
>> +                    .parse_property_string(gc_store_config.tuning.as_deref().unwrap_or(""))?,
>> +            )?;
>> +            if tuning.gc_atime_safety_check.unwrap_or(true) {
>> +                self.inner.chunk_store.atime_safety_check()?;
>> +                info!("Filesystem atime safety check successful.");
>> +            } else {
>> +                info!("Filesystem atime safety check disabled by datastore tuning options.");
>> +            }
>>   
>>               info!("Start GC phase1 (mark used chunks)");
>>   
>> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
>> index fe3260f6d..35847fc45 100644
>> --- a/src/api2/config/datastore.rs
>> +++ b/src/api2/config/datastore.rs
>> @@ -119,6 +119,7 @@ pub(crate) fn do_create_datastore(
>>                   backup_user.uid,
>>                   backup_user.gid,
>>                   tuning.sync_level.unwrap_or_default(),
>> +                tuning.gc_atime_safety_check.unwrap_or(true),
>>               )
>>               .map(|_| ())
>>           } else {
>> -- 
>> 2.39.5
>>




