[pbs-devel] [PATCH v5 proxmox-backup 26/28] pxar: add heuristic to reduce reused chunk fragmentation

Christian Ebner c.ebner at proxmox.com
Wed Nov 15 16:48:11 CET 2023


For multiple consecutive runs with metadata-based file change detection,
referencing existing chunks can lead to fragmentation and an increased
size of the index file.

In order to reduce this, check what fraction of the chunks' data was
referenced by the previous run and re-chunk files for which this fraction
falls below a constant threshold.

Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
Changes since version 4:
- no changes

Changes since version 3:
- not present in version 3
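
As an illustration for reviewers (not part of the patch), the heuristic
boils down to the following standalone sketch; `should_reference`, its
parameters and the example values are hypothetical, not the actual
Archiver API:

    const FRAGMENTATION_THRESHOLD: f64 = 0.75;

    /// Decide whether a file's data should be referenced from the previous
    /// run or re-chunked. `chunk_sizes` holds the sizes of the chunks
    /// covering the file, `referenced_bytes` the amount of data within
    /// those chunks that the previous run already referenced.
    fn should_reference(chunk_sizes: &[u64], referenced_bytes: u64) -> bool {
        // Files spanning more than two chunks are always referenced.
        if chunk_sizes.len() >= 3 {
            return true;
        }
        // Otherwise require that at least 3/4 of the covered chunks' data
        // was already referenced by the previous backup run.
        let chunk_sum: f64 = chunk_sizes.iter().map(|&s| s as f64).sum();
        referenced_bytes as f64 / chunk_sum >= FRAGMENTATION_THRESHOLD
    }

    fn main() {
        // Two 4 MiB chunks, only 2 MiB referenced last run:
        // 2 / 8 = 0.25 < 0.75, so the file gets re-chunked.
        assert!(!should_reference(&[4 << 20, 4 << 20], 2 << 20));
    }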

 pbs-client/src/pxar/create.rs | 55 +++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index 8d3db5e8..baeba859 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -1,7 +1,8 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::ffi::{CStr, CString, OsStr};
 use std::fmt;
 use std::io::{self, Read};
+use std::ops::Bound::Included;
 use std::os::unix::ffi::OsStrExt;
 use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
 const MAX_FILE_SIZE: u64 = 1024;
+const FRAGMENTATION_THRESHOLD: f64 = 0.75;
 
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default)]
@@ -236,6 +238,7 @@ struct Archiver {
     forced_boundaries: Arc<Mutex<VecDeque<InjectChunks>>>,
     appendix: Appendix,
     prev_appendix: Option<AppendixStartOffset>,
+    ref_offsets: BTreeMap<u64, u64>,
 }
 
 type Encoder<'a, T> = pxar::encoder::aio::Encoder<'a, T>;
@@ -291,21 +294,23 @@ where
         )?);
     }
 
-    let (appendix_start, prev_cat_parent) = if let Some(ref mut prev_ref) = options.previous_ref {
-        let entry = prev_ref
-            .catalog
-            .lookup_recursive(prev_ref.archive_name.as_bytes())?;
-        let parent = match entry.attr {
-            DirEntryAttribute::Archive { .. } => Some(entry),
-            _ => None,
+    let (appendix_start, prev_cat_parent, ref_offsets) =
+        if let Some(ref mut prev_ref) = options.previous_ref {
+            let entry = prev_ref
+                .catalog
+                .lookup_recursive(prev_ref.archive_name.as_bytes())?;
+            let parent = match entry.attr {
+                DirEntryAttribute::Archive { .. } => Some(entry),
+                _ => None,
+            };
+            let appendix_start = prev_ref
+                .catalog
+                .appendix_offset(prev_ref.archive_name.as_bytes())?;
+            let ref_offsets = prev_ref.catalog.fetch_offsets()?;
+            (appendix_start, parent, ref_offsets)
+        } else {
+            (None, None, BTreeMap::new())
         };
-        let appendix_start = prev_ref
-            .catalog
-            .appendix_offset(prev_ref.archive_name.as_bytes())?;
-        (appendix_start, parent)
-    } else {
-        (None, None)
-    };
 
     let mut archiver = Archiver {
         feature_flags,
@@ -325,6 +330,7 @@ where
         forced_boundaries,
         appendix: Appendix::new(),
         prev_appendix: appendix_start,
+        ref_offsets,
     };
 
     if let Some(ref mut catalog) = archiver.catalog {
@@ -771,6 +777,25 @@ impl Archiver {
         let (indices, start_padding, end_padding) =
             prev_ref.index.indices(start_offset, end_offset)?;
 
+        // Heuristic to reduce chunk fragmentation: only reference a file if
+        // enough of it was already referenced by the previous run. If the file
+        // spans more than 2 chunks, it is always referenced; otherwise require
+        // that at least 3/4 of the chunks' data was referenced in the last run.
+        if indices.len() < 3 {
+            let chunk_sum = indices.iter().fold(0f64, |sum, ind| sum + ind.end() as f64);
+            let chunks_start = start_offset - start_padding;
+            let chunks_end = end_offset + end_padding;
+            let refs_sum = self
+                .ref_offsets
+                .range((Included(chunks_start), Included(chunks_end)))
+                .fold(0f64, |sum, (_, size)| sum + *size as f64);
+
+            // Does not cover the attribute size information
+            if refs_sum / chunk_sum < FRAGMENTATION_THRESHOLD {
+                return Ok(false);
+            }
+        }
+
         let appendix_ref_offset = self.appendix.insert(indices, start_padding);
         let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
         self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
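
(Note for reviewers, not part of the patch: the `ref_offsets.range(..)`
fold above can be illustrated with this self-contained snippet; the
offsets and sizes are made up.)

    use std::collections::BTreeMap;
    use std::ops::Bound::Included;

    fn main() {
        // Referenced file offsets mapped to their sizes, as fetched from
        // the previous run's catalog.
        let mut ref_offsets: BTreeMap<u64, u64> = BTreeMap::new();
        ref_offsets.insert(0, 1024);
        ref_offsets.insert(4096, 2048);
        ref_offsets.insert(16384, 512);

        // Sum the sizes of all references falling inside the chunk range,
        // mirroring the fold over `self.ref_offsets.range(..)` above.
        let (chunks_start, chunks_end) = (0u64, 8192u64);
        let refs_sum: u64 = ref_offsets
            .range((Included(chunks_start), Included(chunks_end)))
            .map(|(_, size)| *size)
            .sum();
        assert_eq!(refs_sum, 3072); // entry at offset 16384 lies outside
    }
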
-- 
2.39.2