[pbs-devel] [PATCH v3 proxmox-backup 43/58] client: pxar: implement store to insert chunks on caching

Christian Ebner c.ebner at proxmox.com
Thu Mar 28 13:36:52 CET 2024


In preparation for the look-ahead caching used to temporarily store
entries before encoding them in the pxar archive, in order to be able
to decide whether to re-use or re-encode regular file entries.

Allows inserting and storing reused chunks in the archiver,
deduplicating chunks upon insertion when possible.

Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
changes since version 2:
- Strongly adapted and refactored: also keep track of paddings
  introduced by reusing the chunks, making a suggestion whether to
  re-use, re-encode or check the next entry based on a threshold
- completely removed code which allowed calculating offsets based on
  chunks found in the middle; they must either be a continuation of the
  end or be added after, otherwise offsets are not monotonically
  increasing, which is required for sequential restore

 pbs-client/src/pxar/create.rs | 126 +++++++++++++++++++++++++++++++++-
 1 file changed, 125 insertions(+), 1 deletion(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index 335e3556f..95a91a59b 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -20,7 +20,7 @@ use pathpatterns::{MatchEntry, MatchFlag, MatchList, MatchType, PatternFlag};
 use pbs_datastore::index::IndexFile;
 use proxmox_sys::error::SysError;
 use pxar::accessor::aio::Accessor;
-use pxar::encoder::{LinkOffset, SeqWrite};
+use pxar::encoder::{LinkOffset, PayloadOffset, SeqWrite};
 use pxar::Metadata;
 
 use proxmox_io::vec;
@@ -36,6 +36,128 @@ use crate::pxar::metadata::errno_is_unsupported;
 use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
+const CHUNK_PADDING_THRESHOLD: f64 = 0.1;
+
+#[derive(Default)]
+struct ReusedChunks {
+    start_boundary: PayloadOffset,
+    total: PayloadOffset,
+    padding: u64,
+    chunks: Vec<(u64, ReusableDynamicEntry)>,
+    must_flush_first: bool,
+    suggestion: Suggested,
+}
+
+#[derive(Copy, Clone, Default)]
+enum Suggested {
+    #[default]
+    CheckNext,
+    Reuse,
+    Reencode,
+}
+
+impl ReusedChunks {
+    fn new() -> Self {
+        Self::default()
+    }
+
+    fn start_boundary(&self) -> PayloadOffset {
+        self.start_boundary
+    }
+
+    fn is_empty(&self) -> bool {
+        self.chunks.is_empty()
+    }
+
+    fn suggested(&self) -> Suggested {
+        self.suggestion
+    }
+
+    fn insert(
+        &mut self,
+        indices: Vec<ReusableDynamicEntry>,
+        boundary: PayloadOffset,
+        start_padding: u64,
+        end_padding: u64,
+    ) -> PayloadOffset {
+        if self.is_empty() {
+            self.start_boundary = boundary;
+        }
+
+        if let Some(offset) = self.last_digest_matched(&indices) {
+            if let Some((padding, last)) = self.chunks.last_mut() {
+                // Existing chunk, update padding based on pre-existing one
+                // Start padding is expected to be larger than previous padding
+                *padding += start_padding - last.size();
+                self.padding += start_padding - last.size();
+            }
+
+            for chunk in indices.into_iter().skip(1) {
+                self.total = self.total.add(chunk.size());
+                self.chunks.push((0, chunk));
+            }
+
+            if let Some((padding, _last)) = self.chunks.last_mut() {
+                *padding += end_padding;
+                self.padding += end_padding;
+            }
+
+            let padding_ratio = self.padding as f64 / self.total.raw() as f64;
+            if self.chunks.len() > 1 && padding_ratio < CHUNK_PADDING_THRESHOLD {
+                self.suggestion = Suggested::Reuse;
+            }
+
+            self.start_boundary.add(offset + start_padding)
+        } else {
+            let offset = self.total.raw();
+
+            if let Some(first) = indices.first() {
+                self.total = self.total.add(first.size());
+                self.chunks.push((start_padding, first.clone()));
+                // New chunk, all start padding counts
+                self.padding += start_padding;
+            }
+
+            for chunk in indices.into_iter().skip(1) {
+                self.total = self.total.add(chunk.size());
+                self.chunks.push((chunk.size(), chunk));
+            }
+
+            if let Some((padding, _last)) = self.chunks.last_mut() {
+                *padding += end_padding;
+                self.padding += end_padding;
+            }
+
+            if self.chunks.len() > 2 {
+                let padding_ratio = self.padding as f64 / self.total.raw() as f64;
+                if padding_ratio < CHUNK_PADDING_THRESHOLD {
+                    self.suggestion = Suggested::Reuse;
+                } else {
+                    self.suggestion = Suggested::Reencode;
+                }
+            }
+
+            self.start_boundary.add(offset + start_padding)
+        }
+    }
+
+    fn last_digest_matched(&self, indices: &[ReusableDynamicEntry]) -> Option<u64> {
+        let digest = if let Some(first) = indices.first() {
+            first.digest()
+        } else {
+            return None;
+        };
+
+        if let Some(last) = self.chunks.last() {
+            if last.1.digest() == digest {
+                return Some(self.total.raw() - last.1.size());
+            }
+        }
+
+        None
+    }
+}
+
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default, Clone)]
 pub struct PxarCreateOptions {
@@ -147,6 +269,7 @@ struct Archiver {
     hardlinks: HashMap<HardLinkInfo, (PathBuf, LinkOffset)>,
     file_copy_buffer: Vec<u8>,
     skip_e2big_xattr: bool,
+    reused_chunks: ReusedChunks,
     forced_boundaries: Option<Arc<Mutex<VecDeque<InjectChunks>>>>,
 }
 
@@ -239,6 +362,7 @@ where
         hardlinks: HashMap::new(),
         file_copy_buffer: vec::undefined(4 * 1024 * 1024),
         skip_e2big_xattr: options.skip_e2big_xattr,
+        reused_chunks: ReusedChunks::new(),
         forced_boundaries,
     };
 
-- 
2.39.2





More information about the pbs-devel mailing list