[pbs-devel] [PATCH v4 proxmox-backup 25/26] pxar: add heuristic to reduce reused chunk fragmentation
Christian Ebner
c.ebner at proxmox.com
Thu Nov 9 19:46:13 CET 2023
For multiple consecutive runs with metadata based file change detection
the referencing of existing chunks can lead to fragmentation and an
increased size of the index file.
In order to reduce this, check which chunks relative size has been
referenced by the previous run and re-chunk files where a constant
threshold value has been underrun.
Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
Changes since version 3:
- no present in version 3
pbs-client/src/pxar/create.rs | 55 +++++++++++++++++++++++++----------
1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index 8b283b23..56e1de7a 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -1,7 +1,8 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::ffi::{CStr, CString, OsStr};
use std::fmt;
use std::io::{self, Read};
+use std::ops::Bound::Included;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use crate::pxar::tools::assert_single_path_component;
use crate::pxar::Flags;
const MAX_FILE_SIZE: u64 = 1024;
+const FRAGMENTATION_THRESHOLD: f64 = 0.75;
/// Pxar options for creating a pxar archive/stream
#[derive(Default)]
@@ -214,6 +216,7 @@ struct Archiver {
forced_boundaries: Arc<Mutex<VecDeque<InjectChunks>>>,
appendix: Appendix,
prev_appendix: Option<AppendixStartOffset>,
+ ref_offsets: BTreeMap<u64, u64>,
}
type Encoder<'a, T> = pxar::encoder::aio::Encoder<'a, T>;
@@ -265,21 +268,23 @@ where
)?);
}
- let (appendix_start, prev_cat_parent) = if let Some(ref mut prev_ref) = options.previous_ref {
- let entry = prev_ref
- .catalog
- .lookup_recursive(prev_ref.archive_name.as_bytes())?;
- let parent = match entry.attr {
- DirEntryAttribute::Archive { .. } => Some(entry),
- _ => None,
+ let (appendix_start, prev_cat_parent, ref_offsets) =
+ if let Some(ref mut prev_ref) = options.previous_ref {
+ let entry = prev_ref
+ .catalog
+ .lookup_recursive(prev_ref.archive_name.as_bytes())?;
+ let parent = match entry.attr {
+ DirEntryAttribute::Archive { .. } => Some(entry),
+ _ => None,
+ };
+ let appendix_start = prev_ref
+ .catalog
+ .appendix_offset(prev_ref.archive_name.as_bytes())?;
+ let ref_offsets = prev_ref.catalog.fetch_offsets()?;
+ (appendix_start, parent, ref_offsets)
+ } else {
+ (None, None, BTreeMap::new())
};
- let appendix_start = prev_ref
- .catalog
- .appendix_offset(prev_ref.archive_name.as_bytes())?;
- (appendix_start, parent)
- } else {
- (None, None)
- };
let mut archiver = Archiver {
feature_flags,
@@ -299,6 +304,7 @@ where
forced_boundaries,
appendix: Appendix::new(),
prev_appendix: appendix_start,
+ ref_offsets,
};
if let Some(ref mut catalog) = archiver.catalog {
@@ -745,6 +751,25 @@ impl Archiver {
let (indices, start_padding, end_padding) =
prev_ref.index.indices(start_offset, end_offset)?;
+ // Heuristic to reduce chunk fragmentation by only referencing files if the previous
+ // run referenced the files. If the files spans more than 2 chunks, it should always be
+ // referenced, otherwise check if at least 3/4 of the chunk where referenced in the
+ // last backup run.
+ if indices.len() < 3 {
+ let chunk_sum = indices.iter().fold(0f64, |sum, ind| sum + ind.end() as f64);
+ let chunks_start = start_offset - start_padding;
+ let chunks_end = end_offset + end_padding;
+ let refs_sum = self
+ .ref_offsets
+ .range((Included(chunks_start), Included(chunks_end)))
+ .fold(0f64, |sum, (_, size)| sum + *size as f64);
+
+ // Does not cover the attribute size information
+ if refs_sum / chunk_sum < FRAGMENTATION_THRESHOLD {
+ return Ok(false);
+ }
+ }
+
let appendix_ref_offset = self.appendix.insert(indices, start_padding);
let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
--
2.39.2
More information about the pbs-devel
mailing list