[pbs-devel] [PATCH proxmox-backup 1/3] fix #3786: api: add resync-corrupt option to sync jobs

Gabriel Goller g.goller at proxmox.com
Tue Oct 15 15:18:21 CEST 2024


This option allows us to "fix" corrupt snapshots (and/or their chunks)
by pulling them from another remote. When traversing the remote
snapshots, we check if it exists locally, and if it is, we check if the
last verification of it failed. If the local snapshot is broken and the
`resync-corrupt` option is turned on, we pull in the remote snapshot,
overwriting the local one.

This is very useful and has been requested a lot, as there is currently
no way to "fix" corrupt chunks/snapshots even if the user has a healthy
version of it on their offsite instance.

Signed-off-by: Gabriel Goller <g.goller at proxmox.com>
Originally-by: Shannon Sterz <s.sterz at proxmox.com>
---
 pbs-api-types/src/jobs.rs         | 11 +++++
 pbs-datastore/src/backup_info.rs  | 13 +++++-
 src/api2/config/sync.rs           |  4 ++
 src/api2/pull.rs                  |  9 +++-
 src/bin/proxmox-backup-manager.rs |  4 +-
 src/server/pull.rs                | 68 +++++++++++++++++++++----------
 6 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/pbs-api-types/src/jobs.rs b/pbs-api-types/src/jobs.rs
index 868702bc059e..a3cd68bf5afd 100644
--- a/pbs-api-types/src/jobs.rs
+++ b/pbs-api-types/src/jobs.rs
@@ -498,6 +498,11 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
         .minimum(1)
         .schema();
 
+pub const RESYNC_CORRUPT_SCHEMA: Schema = BooleanSchema::new(
+    "If the verification failed for a local snapshot, try to pull its chunks again.",
+)
+.schema();
+
 #[api(
     properties: {
         id: {
@@ -552,6 +557,10 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
             schema: TRANSFER_LAST_SCHEMA,
             optional: true,
         },
+        "resync-corrupt": {
+            schema: RESYNC_CORRUPT_SCHEMA,
+            optional: true,
+        }
     }
 )]
 #[derive(Serialize, Deserialize, Clone, Updater, PartialEq)]
@@ -585,6 +594,8 @@ pub struct SyncJobConfig {
     pub limit: RateLimitConfig,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub transfer_last: Option<usize>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub resync_corrupt: Option<bool>,
 }
 
 impl SyncJobConfig {
diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
index 414ec878d01a..2d283dbe5cd9 100644
--- a/pbs-datastore/src/backup_info.rs
+++ b/pbs-datastore/src/backup_info.rs
@@ -8,7 +8,8 @@ use anyhow::{bail, format_err, Error};
 use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions};
 
 use pbs_api_types::{
-    Authid, BackupNamespace, BackupType, GroupFilter, BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
+    Authid, BackupNamespace, BackupType, GroupFilter, SnapshotVerifyState, VerifyState,
+    BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
 };
 use pbs_config::{open_backup_lockfile, BackupLockGuard};
 
@@ -583,6 +584,16 @@ impl BackupDir {
 
         Ok(())
     }
+
+    /// Load the verify state from the manifest.
+    pub fn verify_state(&self) -> Option<VerifyState> {
+        self.load_manifest().ok().and_then(|(m, _)| {
+            let verify = m.unprotected["verify_state"].clone();
+            serde_json::from_value::<SnapshotVerifyState>(verify)
+                .ok()
+                .map(|svs| svs.state)
+        })
+    }
 }
 
 impl AsRef<pbs_api_types::BackupNamespace> for BackupDir {
diff --git a/src/api2/config/sync.rs b/src/api2/config/sync.rs
index 6fdc69a9e645..fa9db92f3d11 100644
--- a/src/api2/config/sync.rs
+++ b/src/api2/config/sync.rs
@@ -368,6 +368,9 @@ pub fn update_sync_job(
     if let Some(transfer_last) = update.transfer_last {
         data.transfer_last = Some(transfer_last);
     }
+    if let Some(resync_corrupt) = update.resync_corrupt {
+        data.resync_corrupt = Some(resync_corrupt);
+    }
 
     if update.limit.rate_in.is_some() {
         data.limit.rate_in = update.limit.rate_in;
@@ -527,6 +530,7 @@ acl:1:/remote/remote1/remotestore1:write at pbs:RemoteSyncOperator
         ns: None,
         owner: Some(write_auth_id.clone()),
         comment: None,
+        resync_corrupt: None,
         remove_vanished: None,
         max_depth: None,
         group_filter: None,
diff --git a/src/api2/pull.rs b/src/api2/pull.rs
index e733c9839e3a..0d4be0e2d228 100644
--- a/src/api2/pull.rs
+++ b/src/api2/pull.rs
@@ -10,7 +10,7 @@ use pbs_api_types::{
     Authid, BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
     GROUP_FILTER_LIST_SCHEMA, NS_MAX_DEPTH_REDUCED_SCHEMA, PRIV_DATASTORE_BACKUP,
     PRIV_DATASTORE_PRUNE, PRIV_REMOTE_READ, REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA,
-    TRANSFER_LAST_SCHEMA,
+    RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
 };
 use pbs_config::CachedUserInfo;
 use proxmox_human_byte::HumanByte;
@@ -89,6 +89,7 @@ impl TryFrom<&SyncJobConfig> for PullParameters {
             sync_job.group_filter.clone(),
             sync_job.limit.clone(),
             sync_job.transfer_last,
+            sync_job.resync_corrupt,
         )
     }
 }
@@ -240,6 +241,10 @@ pub fn do_sync_job(
                 schema: TRANSFER_LAST_SCHEMA,
                 optional: true,
             },
+            "resync-corrupt": {
+                schema: RESYNC_CORRUPT_SCHEMA,
+                optional: true,
+            },
         },
     },
     access: {
@@ -264,6 +269,7 @@ async fn pull(
     group_filter: Option<Vec<GroupFilter>>,
     limit: RateLimitConfig,
     transfer_last: Option<usize>,
+    resync_corrupt: Option<bool>,
     rpcenv: &mut dyn RpcEnvironment,
 ) -> Result<String, Error> {
     let auth_id: Authid = rpcenv.get_auth_id().unwrap().parse()?;
@@ -301,6 +307,7 @@ async fn pull(
         group_filter,
         limit,
         transfer_last,
+        resync_corrupt,
     )?;
 
     // fixme: set to_stdout to false?
diff --git a/src/bin/proxmox-backup-manager.rs b/src/bin/proxmox-backup-manager.rs
index 420e96665662..38a1cf0f5881 100644
--- a/src/bin/proxmox-backup-manager.rs
+++ b/src/bin/proxmox-backup-manager.rs
@@ -14,8 +14,8 @@ use pbs_api_types::percent_encoding::percent_encode_component;
 use pbs_api_types::{
     BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
     GROUP_FILTER_LIST_SCHEMA, IGNORE_VERIFIED_BACKUPS_SCHEMA, NS_MAX_DEPTH_SCHEMA,
-    REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, TRANSFER_LAST_SCHEMA, UPID_SCHEMA,
-    VERIFICATION_OUTDATED_AFTER_SCHEMA,
+    REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
+    UPID_SCHEMA, VERIFICATION_OUTDATED_AFTER_SCHEMA,
 };
 use pbs_client::{display_task_log, view_task_result};
 use pbs_config::sync;
diff --git a/src/server/pull.rs b/src/server/pull.rs
index 3117f7d2c960..67881f83b5e3 100644
--- a/src/server/pull.rs
+++ b/src/server/pull.rs
@@ -12,7 +12,8 @@ use tracing::info;
 
 use pbs_api_types::{
     print_store_and_ns, Authid, BackupDir, BackupGroup, BackupNamespace, GroupFilter, Operation,
-    RateLimitConfig, Remote, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_BACKUP,
+    RateLimitConfig, Remote, VerifyState, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT,
+    PRIV_DATASTORE_BACKUP,
 };
 use pbs_client::BackupRepository;
 use pbs_config::CachedUserInfo;
@@ -55,6 +56,8 @@ pub(crate) struct PullParameters {
     group_filter: Vec<GroupFilter>,
     /// How many snapshots should be transferred at most (taking the newest N snapshots)
     transfer_last: Option<usize>,
+    /// Whether to re-sync corrupted snapshots
+    resync_corrupt: bool,
 }
 
 impl PullParameters {
@@ -72,12 +75,14 @@ impl PullParameters {
         group_filter: Option<Vec<GroupFilter>>,
         limit: RateLimitConfig,
         transfer_last: Option<usize>,
+        resync_corrupt: Option<bool>,
     ) -> Result<Self, Error> {
         if let Some(max_depth) = max_depth {
             ns.check_max_depth(max_depth)?;
             remote_ns.check_max_depth(max_depth)?;
         };
         let remove_vanished = remove_vanished.unwrap_or(false);
+        let resync_corrupt = resync_corrupt.unwrap_or(false);
 
         let source: Arc<dyn SyncSource> = if let Some(remote) = remote {
             let (remote_config, _digest) = pbs_config::remote::config()?;
@@ -116,6 +121,7 @@ impl PullParameters {
             max_depth,
             group_filter,
             transfer_last,
+            resync_corrupt,
         })
     }
 }
@@ -175,9 +181,10 @@ async fn pull_index_chunks<I: IndexFile>(
                     target.cond_touch_chunk(&info.digest, false)
                 })?;
                 if chunk_exists {
-                    //info!("chunk {} exists {}", pos, hex::encode(digest));
+                    //info!("chunk exists {}", hex::encode(info.digest));
                     return Ok::<_, Error>(());
                 }
+
                 //info!("sync {} chunk {}", pos, hex::encode(digest));
                 let chunk = chunk_reader.read_raw_chunk(&info.digest).await?;
                 let raw_size = chunk.raw_size() as usize;
@@ -332,6 +339,7 @@ async fn pull_snapshot<'a>(
     reader: Arc<dyn SyncSourceReader + 'a>,
     snapshot: &'a pbs_datastore::BackupDir,
     downloaded_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
+    resync_corrupt: bool,
 ) -> Result<SyncStats, Error> {
     let mut sync_stats = SyncStats::default();
     let mut manifest_name = snapshot.full_path();
@@ -352,6 +360,8 @@ async fn pull_snapshot<'a>(
         return Ok(sync_stats);
     }
 
+    let snapshot_is_corrupt = snapshot.verify_state() == Some(VerifyState::Failed);
+
     if manifest_name.exists() {
         let manifest_blob = proxmox_lang::try_block!({
             let mut manifest_file = std::fs::File::open(&manifest_name).map_err(|err| {
@@ -365,7 +375,9 @@ async fn pull_snapshot<'a>(
             format_err!("unable to read local manifest {manifest_name:?} - {err}")
         })?;
 
-        if manifest_blob.raw_data() == tmp_manifest_blob.raw_data() {
+        if (manifest_blob.raw_data() == tmp_manifest_blob.raw_data())
+            && (!resync_corrupt && !snapshot_is_corrupt)
+        {
             if !client_log_name.exists() {
                 reader.try_download_client_log(&client_log_name).await?;
             };
@@ -381,7 +393,7 @@ async fn pull_snapshot<'a>(
         let mut path = snapshot.full_path();
         path.push(&item.filename);
 
-        if path.exists() {
+        if !(resync_corrupt && snapshot_is_corrupt) && path.exists() {
             match ArchiveType::from_path(&item.filename)? {
                 ArchiveType::DynamicIndex => {
                     let index = DynamicIndexReader::open(&path)?;
@@ -416,6 +428,10 @@ async fn pull_snapshot<'a>(
             }
         }
 
+        if resync_corrupt && snapshot_is_corrupt {
+            info!("pulling snapshot {} again", snapshot.dir());
+        }
+
         let stats =
             pull_single_archive(reader.clone(), snapshot, item, downloaded_chunks.clone()).await?;
         sync_stats.add(stats);
@@ -443,6 +459,7 @@ async fn pull_snapshot_from<'a>(
     reader: Arc<dyn SyncSourceReader + 'a>,
     snapshot: &'a pbs_datastore::BackupDir,
     downloaded_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
+    resync_corrupt: bool,
 ) -> Result<SyncStats, Error> {
     let (_path, is_new, _snap_lock) = snapshot
         .datastore()
@@ -451,7 +468,7 @@ async fn pull_snapshot_from<'a>(
     let sync_stats = if is_new {
         info!("sync snapshot {}", snapshot.dir());
 
-        match pull_snapshot(reader, snapshot, downloaded_chunks).await {
+        match pull_snapshot(reader, snapshot, downloaded_chunks, resync_corrupt).await {
             Err(err) => {
                 if let Err(cleanup_err) = snapshot.datastore().remove_backup_dir(
                     snapshot.backup_ns(),
@@ -469,7 +486,7 @@ async fn pull_snapshot_from<'a>(
         }
     } else {
         info!("re-sync snapshot {}", snapshot.dir());
-        pull_snapshot(reader, snapshot, downloaded_chunks).await?
+        pull_snapshot(reader, snapshot, downloaded_chunks, resync_corrupt).await?
     };
 
     Ok(sync_stats)
@@ -528,21 +545,24 @@ async fn pull_group(
         .enumerate()
         .filter(|&(pos, ref dir)| {
             source_snapshots.insert(dir.time);
-            if last_sync_time > dir.time {
-                already_synced_skip_info.update(dir.time);
-                return false;
-            } else if already_synced_skip_info.count > 0 {
-                info!("{already_synced_skip_info}");
-                already_synced_skip_info.reset();
-                return true;
-            }
+            // If resync_corrupt is set, we go through all the remote snapshots
+            if !params.resync_corrupt {
+                if last_sync_time > dir.time {
+                    already_synced_skip_info.update(dir.time);
+                    return false;
+                } else if already_synced_skip_info.count > 0 {
+                    info!("{already_synced_skip_info}");
+                    already_synced_skip_info.reset();
+                    return true;
+                }
 
-            if pos < cutoff && last_sync_time != dir.time {
-                transfer_last_skip_info.update(dir.time);
-                return false;
-            } else if transfer_last_skip_info.count > 0 {
-                info!("{transfer_last_skip_info}");
-                transfer_last_skip_info.reset();
+                if pos < cutoff && last_sync_time != dir.time {
+                    transfer_last_skip_info.update(dir.time);
+                    return false;
+                } else if transfer_last_skip_info.count > 0 {
+                    info!("{transfer_last_skip_info}");
+                    transfer_last_skip_info.reset();
+                }
             }
             true
         })
@@ -566,7 +586,13 @@ async fn pull_group(
             .source
             .reader(source_namespace, &from_snapshot)
             .await?;
-        let result = pull_snapshot_from(reader, &to_snapshot, downloaded_chunks.clone()).await;
+        let result = pull_snapshot_from(
+            reader,
+            &to_snapshot,
+            downloaded_chunks.clone(),
+            params.resync_corrupt,
+        )
+        .await;
 
         progress.done_snapshots = pos as u64 + 1;
         info!("percentage done: {progress}");
-- 
2.39.5





More information about the pbs-devel mailing list