[pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata

Christian Ebner c.ebner at proxmox.com
Fri Sep 22 09:16:20 CEST 2023


During pxar archive encoding, check regular files against the metadata
stored in their previous backup catalog, if present.

Instead of re-encoding files with unchanged metadata whose size exceeds
a given threshold, mark the entries as appendix references in the
pxar archive and append the chunks containing the file payload to the
appendix.

Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
 pbs-client/src/pxar/create.rs             | 149 +++++++++++++++++++++-
 src/tape/file_formats/snapshot_archive.rs |   2 +-
 2 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index d6afc465..cb9af26f 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -24,7 +24,7 @@ use proxmox_io::vec;
 use proxmox_lang::c_str;
 use proxmox_sys::fs::{self, acl, xattr};
 
-use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader};
+use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader, DirEntryAttribute};
 use pbs_datastore::dynamic_index::{DynamicEntry, DynamicIndexReader};
 
 use crate::inject_reused_chunks::InjectChunks;
@@ -32,6 +32,8 @@ use crate::pxar::metadata::errno_is_unsupported;
 use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
+const MAX_FILE_SIZE: u64 = 128;
+
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default)]
 pub struct PxarCreateOptions {
@@ -218,7 +220,14 @@ where
     archiver
         .archive_dir_contents(&mut encoder, source_dir, true)
         .await?;
-    encoder.finish().await?;
+
+    if archiver.inject.1.len() > 0 {
+        let (appendix_offset, appendix_size) = archiver.add_appendix(&mut encoder).await?;
+        encoder.finish(Some((appendix_offset, appendix_size))).await?;
+    } else {
+        encoder.finish(None).await?;
+    }
+
     Ok(())
 }
 
@@ -529,6 +538,132 @@ impl Archiver {
         Ok(())
     }
 
+    async fn add_appendix<T: SeqWrite + Send>(
+        &mut self,
+        encoder: &mut Encoder<'_, T>,
+    ) -> Result<(LinkOffset, u64), Error> {
+        let total = self
+            .inject
+            .1
+            .iter()
+            .fold(0, |sum, inject| sum + inject.end());
+        let appendix_offset = encoder.add_appendix(total).await?;
+        let mut boundaries = self.forced_boundaries.lock().unwrap();
+        let mut position = encoder.position_add(0);
+
+        // Inject reused chunks in batches of 128 to not exceed upload post req size limit
+        for injects in self.inject.1.chunks(128) {
+            let size = injects
+                .iter()
+                .fold(0, |sum, inject| sum + inject.end() as usize);
+            let inject_chunks = InjectChunks {
+                boundary: position,
+                chunks: injects.to_vec(),
+                size,
+            };
+            boundaries.push_back(inject_chunks);
+            position = encoder.position_add(size as u64);
+        }
+
+        Ok((appendix_offset, total))
+    }
+
+    async fn reuse_if_metadata_unchanged<T: SeqWrite + Send>(
+        &mut self,
+        encoder: &mut Encoder<'_, T>,
+        c_file_name: &CStr,
+        metadata: &Metadata,
+        stat: &FileStat,
+    ) -> Result<bool, Error> {
+        let prev_ref = match self.previous_ref {
+            None => return Ok(false),
+            Some(ref mut prev_ref) => prev_ref
+        };
+
+        let path = Path::new(prev_ref.archive_name.as_str()).join(self.path.clone());
+        let catalog_entry = prev_ref
+            .catalog
+            .lookup_recursive(path.as_os_str().as_bytes())?;
+
+        match catalog_entry.attr {
+            DirEntryAttribute::File {
+                size,
+                mtime,
+                link_offset,
+            } => {
+                let file_size = stat.st_size as u64;
+                if mtime == stat.st_mtime && size == file_size {
+                    if let Some(ref catalog) = self.catalog {
+                        catalog.lock().unwrap().add_file(
+                            c_file_name,
+                            file_size,
+                            stat.st_mtime,
+                            link_offset,
+                        )?;
+                    }
+
+                    // Filename header
+                    let mut metadata_bytes = std::mem::size_of::<pxar::format::Header>();
+                    // Filename payload
+                    metadata_bytes += std::mem::size_of_val(c_file_name);
+                    // Metadata with headers and payloads
+                    metadata_bytes += metadata.calculate_byte_len();
+                    // Payload header
+                    metadata_bytes += std::mem::size_of::<pxar::format::Header>();
+
+                    let metadata_bytes = u64::try_from(metadata_bytes)?;
+                    let chunk_start_offset = link_offset.raw();
+                    let start = chunk_start_offset;
+                    let end = chunk_start_offset + metadata_bytes + file_size;
+                    let (indices, total_size, padding_start) =
+                        prev_ref.index.indices(start, end)?;
+
+                    let mut appendix_offset = self.inject.0 as u64 + padding_start;
+
+                    if let (Some(current_end), Some(new_start)) =
+                        (self.inject.1.last(), indices.first())
+                    {
+                        if new_start.digest() == current_end.digest() {
+                            // Already got that chunk, do not append it again and correct
+                            // appendix_offset to be relative to chunk before this one
+                            appendix_offset -= new_start.end();
+                            if indices.len() > 1 {
+                                // Append all following chunks
+                                self.inject.0 += indices[1..]
+                                    .iter()
+                                    .fold(0, |sum, index| sum + index.end() as usize);
+                                self.inject.1.extend_from_slice(&indices[1..]);
+                            }
+                        }
+                    } else {
+                        self.inject.0 += total_size;
+                        self.inject.1.extend_from_slice(&indices);
+                    }
+
+                    let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
+                    let _offset = self
+                        .add_appendix_ref(
+                            encoder,
+                            file_name,
+                            &metadata,
+                            appendix_offset,
+                            file_size,
+                        )
+                        .await?;
+
+                    return Ok(true);
+                }
+            }
+            DirEntryAttribute::Hardlink => {
+                // Catalog contains a hardlink, but the hard link was not present in the current
+                // pxar archive. So be sure to reencode this file instead of reusing it.
+                return Ok(false)
+            }
+            _ => println!("Unexpected attribute type, expected 'File' or 'Hardlink'"),
+        }
+        Ok(false)
+    }
+
     async fn add_entry<T: SeqWrite + Send>(
         &mut self,
         encoder: &mut Encoder<'_, T>,
@@ -595,6 +730,14 @@ impl Archiver {
                 }
 
                 let file_size = stat.st_size as u64;
+                if file_size > MAX_FILE_SIZE
+                    && self
+                        .reuse_if_metadata_unchanged(encoder, c_file_name, &metadata, stat)
+                        .await?
+                {
+                    return Ok(());
+                }
+
                 let offset: LinkOffset = self
                     .add_regular_file(encoder, fd, file_name, &metadata, file_size)
                     .await?;
@@ -712,7 +855,7 @@ impl Archiver {
         self.fs_feature_flags = old_fs_feature_flags;
         self.current_st_dev = old_st_dev;
 
-        encoder.finish().await?;
+        encoder.finish(None).await?;
         result
     }
 
diff --git a/src/tape/file_formats/snapshot_archive.rs b/src/tape/file_formats/snapshot_archive.rs
index 252384b5..4bbf4727 100644
--- a/src/tape/file_formats/snapshot_archive.rs
+++ b/src/tape/file_formats/snapshot_archive.rs
@@ -88,7 +88,7 @@ pub fn tape_write_snapshot_archive<'a>(
                 proxmox_lang::io_bail!("file '{}' shrunk while reading", filename);
             }
         }
-        encoder.finish()?;
+        encoder.finish(None)?;
         Ok(())
     });
 
-- 
2.39.2






More information about the pbs-devel mailing list