[pbs-devel] [PATCH proxmox-backup 4/4] fix #5799: GC: track chunk digests and accumulate statistics

Christian Ebner c.ebner at proxmox.com
Mon Jan 19 14:27:07 CET 2026


Keep track of all chunk digests referenced by the snapshot index
files encountered during phase 1 of garbage collection in the reverse
lookup table, and fill in the raw chunk size information during
phase 2.

This finally allows gathering the unique chunk count and raw size
information printed at the end of garbage collection.

Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
 pbs-datastore/src/chunk_store.rs |  9 +++++++
 pbs-datastore/src/datastore.rs   | 46 ++++++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)
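
Note for reviewers (not part of the commit message): the
`ReverseDigestMap`, `DigestStatAccumulator` and the per-snapshot
`ReverseMap` helper used below are introduced by earlier patches in
this series, so their exact API is not visible in this patch. The
following is a minimal, self-contained sketch of the interface this
patch assumes; the field names, method signatures and the simplified
namespace/snapshot bookkeeping are illustrative assumptions only, not
the actual implementation.

use std::collections::HashMap;

type Digest = [u8; 32];

#[derive(Default)]
struct ReverseDigestMap {
    // digest -> (raw on-disk chunk size, number of index references)
    chunks: HashMap<Digest, (u64, u64)>,
}

impl ReverseDigestMap {
    // Phase 1: record that a snapshot index references `digest`. The real
    // map additionally records the referencing namespace and snapshot.
    fn insert(&mut self, digest: &Digest, _referenced_size: u64) {
        let entry = self.chunks.entry(*digest).or_insert((0, 0));
        entry.1 += 1;
    }

    // Phase 2: fill in the raw (on-disk) size, but only for digests that
    // were already marked as in use during phase 1.
    fn set_raw_chunk_size(&mut self, digest: &Digest, size: u64) {
        if let Some(entry) = self.chunks.get_mut(digest) {
            entry.0 = size;
        }
    }
}

#[derive(Default)]
struct DigestStatAccumulator {
    unique_chunks: u64,
    raw_size: u64,
}

impl DigestStatAccumulator {
    // Accumulate unique chunk count and raw size over all tracked digests
    // and print the totals at the end of garbage collection.
    fn accumulate_and_list(mut self, map: ReverseDigestMap) {
        for (raw_size, _refs) in map.chunks.into_values() {
            self.unique_chunks += 1;
            self.raw_size += raw_size;
        }
        println!(
            "Unique chunks: {}, raw size: {} bytes",
            self.unique_chunks, self.raw_size
        );
    }
}

fn main() {
    let mut map = ReverseDigestMap::default();
    let digest = [0u8; 32];

    // Phase 1: mark the chunk as referenced by an index file.
    map.insert(&digest, 4096);
    // Phase 2: record the on-disk size found while sweeping the chunk store.
    map.set_raw_chunk_size(&digest, 1024);

    DigestStatAccumulator::default().accumulate_and_list(map);
}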

diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
index e7e94b29f..3bf21e1eb 100644
--- a/pbs-datastore/src/chunk_store.rs
+++ b/pbs-datastore/src/chunk_store.rs
@@ -434,6 +434,7 @@ impl ChunkStore {
         status: &mut GarbageCollectionStatus,
         worker: &dyn WorkerTaskContext,
         cache: Option<&LocalDatastoreLruCache>,
+        mut digest_map: Option<&mut crate::reverse_digest_map::ReverseDigestMap>,
     ) -> Result<(), Error> {
         // unwrap: only `None` in unit tests
         assert!(self.locker.is_some());
@@ -524,6 +525,14 @@ impl ChunkStore {
                         },
                     )?;
                 }
+
+                // Only record the size for digests already present in the mapping
+                if chunk_ext == ChunkExt::None {
+                    if let Some(ref mut map) = digest_map {
+                        let digest = <[u8; 32]>::from_hex(filename.to_bytes())?;
+                        map.set_raw_chunk_size(&digest, stat.st_size as u64);
+                    }
+                }
             }
             drop(lock);
         }
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index 7ad3d917d..7efa15335 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -45,6 +45,7 @@ use crate::dynamic_index::{DynamicIndexReader, DynamicIndexWriter};
 use crate::fixed_index::{FixedIndexReader, FixedIndexWriter};
 use crate::hierarchy::{ListGroups, ListGroupsType, ListNamespaces, ListNamespacesRecursive};
 use crate::index::IndexFile;
+use crate::reverse_digest_map::{DigestStatAccumulator, ReverseDigestMap};
 use crate::s3::S3_CONTENT_PREFIX;
 use crate::task_tracking::{self, update_active_operations};
 use crate::{DataBlob, LocalDatastoreLruCache};
@@ -1433,6 +1434,7 @@ impl DataStore {
         index: Box<dyn IndexFile>,
         file_name: &Path, // only used for error reporting
         chunk_lru_cache: &mut Option<LruCache<[u8; 32], ()>>,
+        mut digest_map: Option<ReverseMap>,
         status: &mut GarbageCollectionStatus,
         worker: &dyn WorkerTaskContext,
         s3_client: Option<Arc<S3Client>>,
@@ -1443,7 +1445,13 @@ impl DataStore {
         for pos in 0..index.index_count() {
             worker.check_abort()?;
             worker.fail_on_shutdown()?;
-            let digest = index.index_digest(pos).unwrap();
+            let chunk_info = index.chunk_info(pos).unwrap();
+            let digest = &chunk_info.digest;
+
+            if let Some(map) = digest_map.as_mut() {
+                map.digests
+                    .insert(digest, map.namespace, map.snapshot, chunk_info.size());
+            }
 
             // Avoid multiple expensive atime updates by utimensat
             if let Some(chunk_lru_cache) = chunk_lru_cache {
@@ -1493,6 +1501,7 @@ impl DataStore {
         worker: &dyn WorkerTaskContext,
         cache_capacity: usize,
         s3_client: Option<Arc<S3Client>>,
+        mut digest_map: Option<&mut ReverseDigestMap>,
     ) -> Result<(), Error> {
         // Iterate twice over the datastore to fetch index files, even if this comes with an
         // additional runtime cost:
@@ -1522,7 +1531,7 @@ impl DataStore {
             .context("creating namespace iterator failed")?
         {
             let namespace = namespace.context("iterating namespaces failed")?;
-            for group in arc_self.iter_backup_groups(namespace)? {
+            for group in arc_self.iter_backup_groups(namespace.clone())? {
                 let group = group.context("iterating backup groups failed")?;
 
                 // Avoid race between listing/marking of snapshots by GC and pruning the last
@@ -1580,10 +1589,21 @@ impl DataStore {
                                 }
                             };
 
+                            let digest_map = if let Some(digests) = digest_map.as_mut() {
+                                Some(ReverseMap {
+                                    digests,
+                                    namespace: &namespace,
+                                    snapshot: snapshot.backup_dir.dir(),
+                                })
+                            } else {
+                                None
+                            };
+
                             self.index_mark_used_chunks(
                                 index,
                                 &path,
                                 &mut chunk_lru_cache,
+                                digest_map,
                                 status,
                                 worker,
                                 s3_client.as_ref().cloned(),
@@ -1625,6 +1645,7 @@ impl DataStore {
                 index,
                 &path,
                 &mut chunk_lru_cache,
+                None,
                 status,
                 worker,
                 s3_client.as_ref().cloned(),
@@ -1766,11 +1787,14 @@ impl DataStore {
 
         info!("Start GC phase1 (mark used chunks)");
 
+        let mut digest_map = Some(ReverseDigestMap::default());
+
         self.mark_used_chunks(
             &mut gc_status,
             worker,
             gc_cache_capacity,
             s3_client.as_ref().cloned(),
+            digest_map.as_mut(),
         )
         .context("marking used chunks failed")?;
 
@@ -1796,6 +1820,10 @@ impl DataStore {
                             None => continue,
                         };
 
+                    if let Some(map) = digest_map.as_mut() {
+                        map.set_raw_chunk_size(&digest, content.size);
+                    }
+
                     let timeout = std::time::Duration::from_secs(0);
                     let _chunk_guard = match self.inner.chunk_store.lock_chunk(&digest, timeout) {
                         Ok(guard) => guard,
@@ -1892,6 +1920,7 @@ impl DataStore {
                 &mut tmp_gc_status,
                 worker,
                 self.cache(),
+                None,
             )?;
         } else {
             self.inner.chunk_store.sweep_unused_chunks(
@@ -1900,6 +1929,7 @@ impl DataStore {
                 &mut gc_status,
                 worker,
                 None,
+                digest_map.as_mut(),
             )?;
         }
 
@@ -1913,6 +1943,12 @@ impl DataStore {
                 );
             }
         }
+
+        if let Some(digest_map) = digest_map.take() {
+            let accumulator = DigestStatAccumulator::default();
+            accumulator.accumulate_and_list(digest_map);
+        }
+
         info!(
             "Removed garbage: {}",
             HumanByte::from(gc_status.removed_bytes),
@@ -2877,3 +2913,9 @@ impl S3DeleteList {
         Ok(())
     }
 }
+
+struct ReverseMap<'a> {
+    digests: &'a mut ReverseDigestMap,
+    namespace: &'a pbs_api_types::BackupNamespace,
+    snapshot: &'a pbs_api_types::BackupDir,
+}
-- 
2.47.3