[pbs-devel] [PATCH v4 proxmox-backup 21/26] fix #3174: client: Add detection mode to backup creation

Christian Ebner c.ebner at proxmox.com
Thu Nov 9 19:46:09 CET 2023


Introduces the `change-detection-mode` parameter to change file
encoding behavior.

When set to `metadata`, the catalog for the previous backup run and
the corresponding index file are fetched from the server and used as
a reference during pxar archive creation.
This allows the archiver to skip encoding of file payloads for
unchanged regular files and to reference their existing chunks in
the new backup's index file instead, creating a pxar archive with an
appendix section containing the payloads as a concatenation of
chunks.

Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
---
Changes since version 3:
- refactor appendix section creation code

Changes since version 2:
- Fix issue with reference catalog and index download when either the
  backup group contains no snapshots or the snapshot does not contain an
  archive with the given name.

Changes since version 1:
- Replace `incremental` flag with `change-detection-mode` param

 pbs-client/src/pxar/create.rs     | 201 ++++++++++++++++--------------
 proxmox-backup-client/src/main.rs | 133 ++++++++++++++++++--
 2 files changed, 230 insertions(+), 104 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index d369ed8e..8b283b23 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -17,7 +17,7 @@ use nix::sys::stat::{FileStat, Mode};
 
 use pathpatterns::{MatchEntry, MatchFlag, MatchList, MatchType, PatternFlag};
 use proxmox_sys::error::SysError;
-use pxar::encoder::{LinkOffset, SeqSink, SeqWrite};
+use pxar::encoder::{AppendixRefOffset, LinkOffset, SeqSink, SeqWrite};
 use pxar::Metadata;
 
 use proxmox_io::vec;
@@ -134,6 +134,67 @@ struct HardLinkInfo {
     st_ino: u64,
 }
 
+/// Reused chunks that make up the appendix section of the pxar archive.
+struct Appendix {
+    /// Sum of `end()` over all `chunks`; newly inserted chunks start at this offset.
+    total: AppendixRefOffset,
+    /// Chunks in the order they get injected into the appendix.
+    chunks: Vec<DynamicEntry>,
+}
+
+impl Appendix {
+    fn new() -> Self {
+        Self {
+            total: AppendixRefOffset::default(),
+            chunks: Vec::new(),
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.chunks.is_empty()
+    }
+
+    /// Registers the chunks backing one file payload and returns the offset of that
+    /// payload within the appendix, `start_padding` included.
+    ///
+    /// A chunk sequence already present in the appendix is referenced, not appended again.
+    fn insert(&mut self, indices: Vec<DynamicEntry>, start_padding: u64) -> AppendixRefOffset {
+        if let Some(offset) = self.digest_sequence_contained(&indices) {
+            return AppendixRefOffset::default().add(offset + start_padding);
+        }
+
+        let offset = self.total;
+        for chunk in indices {
+            self.total = self.total.add(chunk.end());
+            self.chunks.push(chunk);
+        }
+        offset.add(start_padding)
+    }
+
+    /// Returns the appendix offset of the first position where the digests of `indices`
+    /// occur as a contiguous run in `chunks`, `None` if `indices` is empty or never occurs.
+    ///
+    /// All digests have to match: a reference handed out after comparing only the first
+    /// one would point at payload data of unrelated chunks.
+    fn digest_sequence_contained(&self, indices: &[DynamicEntry]) -> Option<u64> {
+        // `windows()` panics on a window size of zero.
+        if indices.is_empty() {
+            return None;
+        }
+
+        let mut offset = 0;
+        for window in self.chunks.windows(indices.len()) {
+            if window.iter().zip(indices).all(|(a, b)| a.digest() == b.digest()) {
+                return Some(offset);
+            }
+            // No match at this position, continue after the window's first chunk.
+            offset += window[0].end();
+        }
+
+        None
+    }
+}
+
 struct Archiver {
     feature_flags: Flags,
     fs_feature_flags: Flags,
@@ -151,7 +212,7 @@ struct Archiver {
     file_copy_buffer: Vec<u8>,
     previous_ref: Option<PxarPrevRef>,
     forced_boundaries: Arc<Mutex<VecDeque<InjectChunks>>>,
-    inject: (pxar::encoder::AppendixRefOffset, Vec<DynamicEntry>),
+    appendix: Appendix,
     prev_appendix: Option<AppendixStartOffset>,
 }
 
@@ -236,7 +297,7 @@ where
         file_copy_buffer: vec::undefined(4 * 1024 * 1024),
         previous_ref: options.previous_ref,
         forced_boundaries,
-        inject: (pxar::encoder::AppendixRefOffset::default(), Vec::new()),
+        appendix: Appendix::new(),
         prev_appendix: appendix_start,
     };
 
@@ -253,7 +314,14 @@ where
         .archive_dir_contents(&mut encoder, source_dir, prev_cat_parent.as_ref(), true)
         .await?;
 
-    if archiver.inject.1.len() > 0 {
+    if archiver.appendix.is_empty() {
+        encoder.finish(None).await?;
+        if let Some(ref mut catalog) = archiver.catalog {
+            if options.archive_name.is_some() {
+                catalog.lock().unwrap().end_archive(None)?;
+            }
+        }
+    } else {
         let (appendix_offset, appendix_size) = archiver.add_appendix(&mut encoder).await?;
         encoder
             .finish(Some((appendix_offset, appendix_size)))
@@ -263,13 +331,6 @@ where
                 catalog.lock().unwrap().end_archive(Some(appendix_offset))?;
             }
         }
-    } else {
-        encoder.finish(None).await?;
-        if let Some(ref mut catalog) = archiver.catalog {
-            if options.archive_name.is_some() {
-                catalog.lock().unwrap().end_archive(None)?;
-            }
-        }
     }
 
     Ok(())
@@ -599,24 +660,20 @@ impl Archiver {
     async fn add_appendix<T: SeqWrite + Send>(
         &mut self,
         encoder: &mut Encoder<'_, T>,
-    ) -> Result<(pxar::encoder::AppendixStartOffset, u64), Error> {
-        let total = self
-            .inject
-            .1
-            .iter()
-            .fold(0, |sum, inject| sum + inject.end());
+    ) -> Result<(pxar::encoder::AppendixStartOffset, AppendixRefOffset), Error> {
+        let total = self.appendix.total;
         let appendix_offset = encoder.add_appendix(total).await?;
         let mut boundaries = self.forced_boundaries.lock().unwrap();
         let mut position = unsafe { encoder.position_add(0) };
 
         // Inject reused chunks in patches of 128 to not exceed upload post req size limit
-        for injects in self.inject.1.chunks(128) {
-            let size = injects
+        for chunks in self.appendix.chunks.chunks(128) {
+            let size = chunks
                 .iter()
-                .fold(0, |sum, inject| sum + inject.end() as usize);
+                .fold(0, |sum, chunk| sum + chunk.end() as usize);
             let inject_chunks = InjectChunks {
                 boundary: position,
-                chunks: injects.to_vec(),
+                chunks: chunks.to_vec(),
                 size,
             };
             boundaries.push_back(inject_chunks);
@@ -626,44 +683,6 @@ impl Archiver {
         Ok((appendix_offset, total))
     }
 
-    fn digest_sequence_contained(
-        indices: &[DynamicEntry],
-        appendix_indices: &[DynamicEntry],
-    ) -> Option<u64> {
-        let digest = if let Some(first) = indices.first() {
-            first.digest()
-        } else {
-            return None;
-        };
-
-        let mut appendix_offset = 0;
-        let mut appendix_iter = appendix_indices.iter();
-        while let Some(position) = appendix_iter.position(|e| {
-            appendix_offset += e.end();
-            e.digest() == digest
-        }) {
-            if indices.len() + position > appendix_indices.len() {
-                // There are not enough remaining chunks, so this will never match
-                return None;
-            }
-
-            // Skip first digest comparison, as this is already covered
-            for (ind, chunk) in indices.iter().skip(1).enumerate() {
-                if chunk.digest() != appendix_indices[ind + position].digest() {
-                    // Not all chunks in the sequence matched,
-                    // try with next position is there is one
-                    break;
-                }
-            }
-            // Remove added value of the while loop again
-            appendix_offset -= appendix_indices[position].end();
-            // All chunks matched the sequence
-            return Some(appendix_offset);
-        }
-
-        None
-    }
-
     async fn reuse_if_metadata_unchanged<T: SeqWrite + Send>(
         &mut self,
         encoder: &mut Encoder<'_, T>,
@@ -711,47 +730,37 @@ impl Archiver {
         };
 
         let file_size = stat.st_size as u64;
-        if ctime == stat.st_ctime && size == file_size {
-            // Since ctime is unchanged, use current metadata size to calculate size and thereby
-            // end offset for the file payload in the reference archive.
-            // This is required to find the existing indexes to be included in the new index file.
-            let mut bytes = pxar::encoder::Encoder::<SeqSink>::byte_len(c_file_name, metadata)?;
-            // payload header size
-            bytes += std::mem::size_of::<pxar::format::Header>() as u64;
-
-            let end_offset = start_offset + bytes + file_size;
-            let (indices, padding_start) = prev_ref.index.indices(start_offset, end_offset)?;
-
-            let appendix_ref_offset = if let Some(off) = Archiver::digest_sequence_contained(&indices, &self.inject.1) {
-                let appendix_ref_offset = pxar::encoder::AppendixRefOffset::default();
-                appendix_ref_offset.add(off + padding_start)
-            } else {
-                let current_end_offset = self.inject.0;
-                for chunk in indices.into_iter() {
-                    self.inject.0 = self.inject.0.add(chunk.end());
-                    self.inject.1.push(chunk);
-                }
-                current_end_offset.add(padding_start)
-            };
+        if ctime != stat.st_ctime || size != file_size {
+            return Ok(false);
+        }
 
-            let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
-            self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
-                .await?;
-
-            if let Some(ref catalog) = self.catalog {
-                catalog.lock().unwrap().add_appendix_ref(
-                    c_file_name,
-                    file_size,
-                    stat.st_mtime,
-                    stat.st_ctime,
-                    appendix_ref_offset,
-                )?;
-            }
+        // Since ctime is unchanged, use current metadata size to calculate size and thereby
+        // end offset for the file payload in the reference archive.
+        // This is required to find the existing indexes to be included in the new index file.
+        let mut bytes = pxar::encoder::Encoder::<SeqSink>::byte_len(c_file_name, metadata)?;
+        // payload header size
+        bytes += std::mem::size_of::<pxar::format::Header>() as u64;
+
+        let end_offset = start_offset + bytes + file_size;
+        let (indices, start_padding, end_padding) =
+            prev_ref.index.indices(start_offset, end_offset)?;
 
-            return Ok(true);
+        let appendix_ref_offset = self.appendix.insert(indices, start_padding);
+        let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
+        self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
+            .await?;
+
+        if let Some(ref catalog) = self.catalog {
+            catalog.lock().unwrap().add_appendix_ref(
+                c_file_name,
+                file_size,
+                stat.st_mtime,
+                stat.st_ctime,
+                appendix_ref_offset,
+            )?;
         }
 
-        Ok(false)
+        Ok(true)
     }
 
     async fn add_entry<T: SeqWrite + Send>(
diff --git a/proxmox-backup-client/src/main.rs b/proxmox-backup-client/src/main.rs
index cbdd9f43..e986716b 100644
--- a/proxmox-backup-client/src/main.rs
+++ b/proxmox-backup-client/src/main.rs
@@ -1,5 +1,6 @@
 use std::collections::{HashSet, VecDeque};
 use std::io::{self, Read, Seek, SeekFrom, Write};
+use std::os::unix::fs::OpenOptionsExt;
 use std::path::{Path, PathBuf};
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
@@ -43,10 +44,10 @@ use pbs_client::tools::{
     CHUNK_SIZE_SCHEMA, REPO_URL_SCHEMA,
 };
 use pbs_client::{
-    delete_ticket_info, parse_backup_specification, view_task_result, BackupReader,
-    BackupRepository, BackupSpecificationType, BackupStats, BackupWriter, ChunkStream,
-    FixedChunkStream, HttpClient, PxarBackupStream, RemoteChunkReader, UploadOptions,
-    BACKUP_SOURCE_SCHEMA,
+    delete_ticket_info, parse_backup_detection_mode_specification, parse_backup_specification,
+    view_task_result, BackupDetectionMode, BackupReader, BackupRepository, BackupSpecificationType,
+    BackupStats, BackupWriter, ChunkStream, FixedChunkStream, HttpClient, PxarBackupStream,
+    RemoteChunkReader, UploadOptions, BACKUP_DETECTION_MODE_SPEC, BACKUP_SOURCE_SCHEMA,
 };
 use pbs_datastore::catalog::{CatalogReader, CatalogWriter};
 use pbs_datastore::chunk_store::verify_chunk_size;
@@ -666,6 +667,10 @@ fn spawn_catalog_upload(
                schema: TRAFFIC_CONTROL_BURST_SCHEMA,
                optional: true,
            },
+           "change-detection-mode": {
+               schema: BACKUP_DETECTION_MODE_SPEC,
+               optional: true,
+           },
            "exclude": {
                type: Array,
                description: "List of paths or patterns for matching files to exclude.",
@@ -849,7 +854,20 @@ async fn create_backup(
 
     let backup_time = backup_time_opt.unwrap_or_else(epoch_i64);
 
-    let client = connect_rate_limited(&repo, rate_limit)?;
+    let cd_mode = param["change-detection-mode"].as_str().unwrap_or("data");
+    let detection_mode = parse_backup_detection_mode_specification(cd_mode)?;
+
+    let client = connect_rate_limited(&repo, rate_limit.clone())?;
+    let backup_group = BackupGroup::new(backup_type, backup_id);
+
+    let previous_snapshot = if let BackupDetectionMode::Metadata(_) = detection_mode {
+        api_datastore_latest_snapshot(&client, &repo.store(), &backup_ns, backup_group)
+            .await
+            .ok()
+    } else {
+        None
+    };
+
     record_repository(&repo);
 
     let snapshot = BackupDir::from((backup_type, backup_id.to_owned(), backup_time));
@@ -959,8 +977,8 @@ async fn create_backup(
         log::info!("{} {} '{}' to '{}' as {}", what, desc, file, repo, target);
     };
 
-    for (backup_type, filename, target, size) in upload_list {
-        match (backup_type, dry_run) {
+    for (backup_spec_type, filename, target, size) in upload_list {
+        match (backup_spec_type, dry_run) {
             // dry-run
             (BackupSpecificationType::CONFIG, true) => log_file("config file", &filename, &target),
             (BackupSpecificationType::LOGFILE, true) => log_file("log file", &filename, &target),
@@ -1006,12 +1024,62 @@ async fn create_backup(
 
                 log_file("directory", &filename, &target);
 
+                let known_chunks = Arc::new(Mutex::new(HashSet::new()));
+                let previous_ref =
+                    if let BackupDetectionMode::Metadata(ref archives) = detection_mode {
+                        if archives.is_empty() || archives.contains(&target) {
+                            match previous_manifest {
+                                None => {
+                                    log::info!("No previous manifest, fallback to regular mode");
+                                    None
+                                }
+                                Some(ref manifest) => {
+                                    let reference_index = client
+                                        .download_previous_dynamic_index(
+                                            &target,
+                                            &manifest,
+                                            known_chunks.clone(),
+                                        )
+                                        .await
+                                        .ok();
+                                    let reference_catalog = download_reference_catalog(
+                                        &repo,
+                                        previous_snapshot.as_ref().unwrap(),
+                                        &backup_ns,
+                                        crypt_config.clone(),
+                                    )
+                                    .await
+                                    .ok();
+
+                                    match  (reference_index, reference_catalog) {
+                                        (Some(reference_index), Some(reference_catalog)) => {
+                                            log::info!(
+                                                "Using previous catalog as metadata reference for '{target}'"
+                                            );
+
+                                            Some(pbs_client::pxar::PxarPrevRef {
+                                                index: reference_index,
+                                                catalog: reference_catalog,
+                                                archive_name: target.clone(),
+                                            })
+                                        }
+                                        _ => None,
+                                    }
+                                }
+                            }
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+
                 let pxar_options = pbs_client::pxar::PxarCreateOptions {
                     device_set: devices.clone(),
                     patterns: pattern_list.clone(),
                     entries_max: entries_max as usize,
                     skip_lost_and_found,
-                    previous_ref: None,
+                    previous_ref,
                     archive_name: Some(std::ffi::CString::new(target.as_str())?),
                 };
 
@@ -1112,6 +1180,55 @@ async fn create_backup(
     Ok(Value::Null)
 }
 
+/// Downloads the catalog of `previous_snapshot` into an unnamed temporary file below
+/// `/tmp` and returns a reader for it, positioned at the start of the catalog.
+async fn download_reference_catalog(
+    repo: &BackupRepository,
+    previous_snapshot: &BackupDir,
+    backup_ns: &BackupNamespace,
+    crypt_config: Option<Arc<CryptConfig>>,
+) -> Result<CatalogReader<std::fs::File>, Error> {
+    let http_reader_client = connect(repo)?;
+    let backup_reader = BackupReader::start(
+        http_reader_client,
+        crypt_config.clone(),
+        repo.store(),
+        backup_ns,
+        previous_snapshot,
+        true,
+    )
+    .await?;
+
+    let (manifest, _) = backup_reader.download_manifest().await?;
+    manifest.check_fingerprint(crypt_config.as_ref().map(Arc::as_ref))?;
+
+    let index = backup_reader
+        .download_dynamic_index(&manifest, CATALOG_NAME)
+        .await?;
+    let most_used = index.find_most_used_chunks(8);
+    let file_info = manifest.lookup_file_info(CATALOG_NAME)?;
+
+    // Last use of `crypt_config`, so it is moved instead of cloned once more.
+    let chunk_reader = RemoteChunkReader::new(
+        backup_reader,
+        crypt_config,
+        file_info.chunk_crypt_mode(),
+        most_used,
+    );
+    let mut reader = BufferedDynamicReader::new(index, chunk_reader);
+
+    // O_TMPFILE creates an unnamed file, nothing is left behind once it is closed.
+    let mut catalogfile = std::fs::OpenOptions::new()
+        .write(true)
+        .read(true)
+        .custom_flags(libc::O_TMPFILE)
+        .open("/tmp")?;
+    std::io::copy(&mut reader, &mut catalogfile)
+        .map_err(|err| format_err!("failed to download reference catalog - {}", err))?;
+    catalogfile.seek(SeekFrom::Start(0))?;
+    Ok(CatalogReader::new(catalogfile))
+}
+
 async fn dump_image<W: Write>(
     client: Arc<BackupReader>,
     crypt_config: Option<Arc<CryptConfig>>,
-- 
2.39.2






More information about the pbs-devel mailing list