[pbs-devel] [PATCH proxmox-backup 15/17] GC: fix race with chunk upload/insert on s3 backends
Christian Ebner
c.ebner at proxmox.com
Mon Nov 3 12:31:18 CET 2025
The previous approach, relying solely on local marker files, was flawed
as it could not protect against all possible races between chunk upload
to the s3 object store, insertion into the local datastore cache and
insertion into the in-memory LRU cache.
Since these operations are now protected by acquiring a per-chunk file
lock, use that lock to check whether it is safe to remove the chunk, and
do so in a consistent manner by holding the chunk lock guard until the
chunk is actually removed from the s3 backend and the caches.
Since an error when removing the chunk from the cache could lead to
inconsistencies, GC must now fail in that case.

The chunk store mutex lock is now only required for the cache removal.
Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
pbs-datastore/src/datastore.rs | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)
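Not part of the patch: for context, a minimal, self-contained sketch of the
non-blocking per-chunk file lock pattern the change relies on. The helper
names, the lock file layout and the use of the `fs2` and `hex` crates are
illustrative assumptions, not the actual ChunkStore::lock_chunk()
implementation; the zero-second timeout in the hunk below corresponds to the
try-lock behaviour shown here.

    // Illustrative sketch only, not the pbs-datastore code: a non-blocking,
    // flock(2)-style per-chunk lock. GC tries to take the lock; if a
    // concurrent upload/insert currently holds it, GC skips the chunk.
    use std::fs::{File, OpenOptions};
    use std::io::ErrorKind;
    use std::path::{Path, PathBuf};

    use fs2::FileExt; // assumed dependency providing flock-based try_lock_exclusive()

    // Hypothetical lock file location, one lock file per chunk digest.
    fn chunk_lock_path(lock_dir: &Path, digest: &[u8; 32]) -> PathBuf {
        lock_dir.join(format!("{}.lock", hex::encode(digest)))
    }

    // Returns Ok(Some(guard)) if the chunk lock could be taken without
    // blocking, Ok(None) if another task currently holds it (GC then keeps
    // the chunk). The lock is released when the returned guard is dropped.
    fn try_lock_chunk(lock_dir: &Path, digest: &[u8; 32]) -> std::io::Result<Option<File>> {
        let file = OpenOptions::new()
            .create(true)
            .write(true)
            .open(chunk_lock_path(lock_dir, digest))?;
        match file.try_lock_exclusive() {
            Ok(()) => Ok(Some(file)),
            Err(err) if err.kind() == ErrorKind::WouldBlock => Ok(None),
            Err(err) => Err(err),
        }
    }

Keeping the guard alive until the removal has actually happened is what the
patch achieves by pushing (content.key, _chunk_guard) into delete_list and
only clearing the list (dropping all guards) after the batch delete on the
s3 backend succeeded.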
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index b36424131..beca72e9a 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -1639,14 +1639,18 @@ impl DataStore {
let mut delete_list = Vec::with_capacity(1000);
loop {
- let lock = self.inner.chunk_store.mutex().lock().unwrap();
-
for content in list_bucket_result.contents {
let (chunk_path, digest) = match self.chunk_path_from_object_key(&content.key) {
Some(path) => path,
None => continue,
};
+ let timeout = std::time::Duration::from_secs(0);
+ let _chunk_guard = match self.inner.chunk_store.lock_chunk(&digest, timeout) {
+ Ok(guard) => guard,
+ Err(_) => continue,
+ };
+
// Check local markers (created or atime updated during phase1) and
// keep or delete chunk based on that.
let atime = match std::fs::metadata(&chunk_path) {
@@ -1675,10 +1679,10 @@ impl DataStore {
&mut gc_status,
|| {
if let Some(cache) = self.cache() {
- // ignore errors, phase 3 will retry cleanup anyways
- let _ = cache.remove(&digest);
+ let _guard = self.inner.chunk_store.mutex().lock().unwrap();
+ cache.remove(&digest)?;
}
- delete_list.push(content.key);
+ delete_list.push((content.key, _chunk_guard));
Ok(())
},
)?;
@@ -1687,14 +1691,19 @@ impl DataStore {
chunk_count += 1;
}
- drop(lock);
-
if !delete_list.is_empty() {
- let delete_objects_result =
- proxmox_async::runtime::block_on(s3_client.delete_objects(&delete_list))?;
+ let delete_objects_result = proxmox_async::runtime::block_on(
+ s3_client.delete_objects(
+ &delete_list
+ .iter()
+ .map(|(key, _)| key.clone())
+ .collect::<Vec<S3ObjectKey>>(),
+ ),
+ )?;
if let Some(_err) = delete_objects_result.error {
bail!("failed to delete some objects");
}
+ // release all chunk guards
delete_list.clear();
}
--
2.47.3