[pbs-devel] [PATCH proxmox-backup v3 1/3] fix #3786: api: add resync-corrupt option to sync jobs
Fabian Grünbichler
f.gruenbichler@proxmox.com
Wed Nov 20 14:11:04 CET 2024
a few small nits inline, looks good to me otherwise. but given the size of this
series and the size of the push series, I'd rather have this one rebased on top
of the other ;)
Quoting Gabriel Goller (2024-11-05 11:40:13)
> This option allows us to "fix" corrupt snapshots (and/or their chunks)
> by pulling them from another remote. When traversing the remote
> snapshots, we check if each one exists locally, and if it does, whether
> its last verification failed. If the local snapshot is broken and the
> `resync-corrupt` option is turned on, we pull in the remote snapshot,
> overwriting the local one.
>
> This is very useful and has been requested a lot, as there is currently
> no way to "fix" corrupt chunks/snapshots, even if the user has a healthy
> version of them on their offsite instance.
>
> Originally-by: Shannon Sterz <s.sterz@proxmox.com>
> Signed-off-by: Gabriel Goller <g.goller@proxmox.com>
> ---
> pbs-api-types/src/jobs.rs | 10 +++++
> pbs-datastore/src/backup_info.rs | 12 +++++-
> pbs-datastore/src/manifest.rs | 13 ++++++-
> src/api2/config/sync.rs | 4 ++
> src/api2/pull.rs | 9 ++++-
> src/bin/proxmox-backup-manager.rs | 4 +-
> src/server/pull.rs | 62 +++++++++++++++++++++++--------
> 7 files changed, 93 insertions(+), 21 deletions(-)
>
> diff --git a/pbs-api-types/src/jobs.rs b/pbs-api-types/src/jobs.rs
> index 868702bc059e..58f739ad00b5 100644
> --- a/pbs-api-types/src/jobs.rs
> +++ b/pbs-api-types/src/jobs.rs
> @@ -498,6 +498,10 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
> .minimum(1)
> .schema();
>
> +pub const RESYNC_CORRUPT_SCHEMA: Schema =
> + BooleanSchema::new("If the verification failed for a local snapshot, try to pull it again.")
> + .schema();
> +
> #[api(
> properties: {
> id: {
> @@ -552,6 +556,10 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
> schema: TRANSFER_LAST_SCHEMA,
> optional: true,
> },
> + "resync-corrupt": {
> + schema: RESYNC_CORRUPT_SCHEMA,
> + optional: true,
> + }
> }
> )]
> #[derive(Serialize, Deserialize, Clone, Updater, PartialEq)]
> @@ -585,6 +593,8 @@ pub struct SyncJobConfig {
> pub limit: RateLimitConfig,
> #[serde(skip_serializing_if = "Option::is_none")]
> pub transfer_last: Option<usize>,
> + #[serde(skip_serializing_if = "Option::is_none")]
> + pub resync_corrupt: Option<bool>,
> }
>
> impl SyncJobConfig {
> diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
> index 414ec878d01a..e6174322dad6 100644
> --- a/pbs-datastore/src/backup_info.rs
> +++ b/pbs-datastore/src/backup_info.rs
> @@ -8,7 +8,8 @@ use anyhow::{bail, format_err, Error};
> use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions};
>
> use pbs_api_types::{
> - Authid, BackupNamespace, BackupType, GroupFilter, BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
> + Authid, BackupNamespace, BackupType, GroupFilter, VerifyState, BACKUP_DATE_REGEX,
> + BACKUP_FILE_REGEX,
> };
> use pbs_config::{open_backup_lockfile, BackupLockGuard};
>
> @@ -583,6 +584,15 @@ impl BackupDir {
>
> Ok(())
> }
> +
> + /// Load the verify state from the manifest.
> + pub fn verify_state(&self) -> Option<VerifyState> {
should this be a Result<Option<..>>, to allow differentiating between "no
verification state" and "failure to parse or load the manifest"? that would
allow us to resync totally corrupted snapshots as well (although that might be
considered out of scope based on the parameter description ;))
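something along these lines (a rough, untested sketch on my part, using the
anyhow Error already imported in this file and assuming the manifest helper
below grows the same Result):

    /// Load the verify state from the manifest.
    pub fn verify_state(&self) -> Result<Option<VerifyState>, Error> {
        // propagate load/parse failures instead of flattening them to None
        let manifest = self.load_manifest()?;
        Ok(manifest.0.verify_state()?)
    }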
> + if let Ok(manifest) = self.load_manifest() {
> + manifest.0.verify_state()
> + } else {
> + None
> + }
> + }
> }
>
> impl AsRef<pbs_api_types::BackupNamespace> for BackupDir {
> diff --git a/pbs-datastore/src/manifest.rs b/pbs-datastore/src/manifest.rs
> index c3df014272a0..623c1499c0bb 100644
> --- a/pbs-datastore/src/manifest.rs
> +++ b/pbs-datastore/src/manifest.rs
> @@ -5,7 +5,7 @@ use anyhow::{bail, format_err, Error};
> use serde::{Deserialize, Serialize};
> use serde_json::{json, Value};
>
> -use pbs_api_types::{BackupType, CryptMode, Fingerprint};
> +use pbs_api_types::{BackupType, CryptMode, Fingerprint, SnapshotVerifyState, VerifyState};
> use pbs_tools::crypt_config::CryptConfig;
>
> pub const MANIFEST_BLOB_NAME: &str = "index.json.blob";
> @@ -242,6 +242,17 @@ impl BackupManifest {
> let manifest: BackupManifest = serde_json::from_value(json)?;
> Ok(manifest)
> }
> +
> + /// Get the verify state of the snapshot
> + ///
> + /// Note: New snapshots, which have not been verified yet, do not have a status and this
> + /// function will return `None`.
> + pub fn verify_state(&self) -> Option<VerifyState> {
should this be a Result<Option<..>>, to allow differentiating between "no
verification state" and "failure to parse"?
also, it would be great if existing code retrieving this state could be adapted
to use these new helpers, which would require having the Result there as well..
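again, just a rough, untested sketch of what I mean (error type as in the rest
of this file):

    pub fn verify_state(&self) -> Result<Option<VerifyState>, Error> {
        let verify = self.unprotected["verify_state"].clone();
        if verify.is_null() {
            // new snapshot, not verified yet
            return Ok(None);
        }
        let state: SnapshotVerifyState = serde_json::from_value(verify)?;
        Ok(Some(state.state))
    }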
> + let verify = self.unprotected["verify_state"].clone();
> + serde_json::from_value::<SnapshotVerifyState>(verify)
> + .map(|svs| svs.state)
> + .ok()
> + }
> }
>
> impl TryFrom<super::DataBlob> for BackupManifest {
> diff --git a/src/api2/config/sync.rs b/src/api2/config/sync.rs
> index 6fdc69a9e645..fa9db92f3d11 100644
> --- a/src/api2/config/sync.rs
> +++ b/src/api2/config/sync.rs
> @@ -368,6 +368,9 @@ pub fn update_sync_job(
> if let Some(transfer_last) = update.transfer_last {
> data.transfer_last = Some(transfer_last);
> }
> + if let Some(resync_corrupt) = update.resync_corrupt {
> + data.resync_corrupt = Some(resync_corrupt);
> + }
>
> if update.limit.rate_in.is_some() {
> data.limit.rate_in = update.limit.rate_in;
> @@ -527,6 +530,7 @@ acl:1:/remote/remote1/remotestore1:write@pbs:RemoteSyncOperator
> ns: None,
> owner: Some(write_auth_id.clone()),
> comment: None,
> + resync_corrupt: None,
> remove_vanished: None,
> max_depth: None,
> group_filter: None,
> diff --git a/src/api2/pull.rs b/src/api2/pull.rs
> index e733c9839e3a..0d4be0e2d228 100644
> --- a/src/api2/pull.rs
> +++ b/src/api2/pull.rs
> @@ -10,7 +10,7 @@ use pbs_api_types::{
> Authid, BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
> GROUP_FILTER_LIST_SCHEMA, NS_MAX_DEPTH_REDUCED_SCHEMA, PRIV_DATASTORE_BACKUP,
> PRIV_DATASTORE_PRUNE, PRIV_REMOTE_READ, REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA,
> - TRANSFER_LAST_SCHEMA,
> + RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
> };
> use pbs_config::CachedUserInfo;
> use proxmox_human_byte::HumanByte;
> @@ -89,6 +89,7 @@ impl TryFrom<&SyncJobConfig> for PullParameters {
> sync_job.group_filter.clone(),
> sync_job.limit.clone(),
> sync_job.transfer_last,
> + sync_job.resync_corrupt,
> )
> }
> }
> @@ -240,6 +241,10 @@ pub fn do_sync_job(
> schema: TRANSFER_LAST_SCHEMA,
> optional: true,
> },
> + "resync-corrupt": {
> + schema: RESYNC_CORRUPT_SCHEMA,
> + optional: true,
> + },
> },
> },
> access: {
> @@ -264,6 +269,7 @@ async fn pull(
> group_filter: Option<Vec<GroupFilter>>,
> limit: RateLimitConfig,
> transfer_last: Option<usize>,
> + resync_corrupt: Option<bool>,
> rpcenv: &mut dyn RpcEnvironment,
> ) -> Result<String, Error> {
> let auth_id: Authid = rpcenv.get_auth_id().unwrap().parse()?;
> @@ -301,6 +307,7 @@ async fn pull(
> group_filter,
> limit,
> transfer_last,
> + resync_corrupt,
> )?;
>
> // fixme: set to_stdout to false?
> diff --git a/src/bin/proxmox-backup-manager.rs b/src/bin/proxmox-backup-manager.rs
> index 420e96665662..38a1cf0f5881 100644
> --- a/src/bin/proxmox-backup-manager.rs
> +++ b/src/bin/proxmox-backup-manager.rs
> @@ -14,8 +14,8 @@ use pbs_api_types::percent_encoding::percent_encode_component;
> use pbs_api_types::{
> BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
> GROUP_FILTER_LIST_SCHEMA, IGNORE_VERIFIED_BACKUPS_SCHEMA, NS_MAX_DEPTH_SCHEMA,
> - REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, TRANSFER_LAST_SCHEMA, UPID_SCHEMA,
> - VERIFICATION_OUTDATED_AFTER_SCHEMA,
> + REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
> + UPID_SCHEMA, VERIFICATION_OUTDATED_AFTER_SCHEMA,
> };
> use pbs_client::{display_task_log, view_task_result};
> use pbs_config::sync;
> diff --git a/src/server/pull.rs b/src/server/pull.rs
> index d9584776ee7f..11a0a9d74cf3 100644
> --- a/src/server/pull.rs
> +++ b/src/server/pull.rs
> @@ -12,7 +12,8 @@ use tracing::info;
>
> use pbs_api_types::{
> print_store_and_ns, Authid, BackupDir, BackupGroup, BackupNamespace, GroupFilter, Operation,
> - RateLimitConfig, Remote, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_BACKUP,
> + RateLimitConfig, Remote, VerifyState, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT,
> + PRIV_DATASTORE_BACKUP,
> };
> use pbs_client::BackupRepository;
> use pbs_config::CachedUserInfo;
> @@ -55,6 +56,8 @@ pub(crate) struct PullParameters {
> group_filter: Vec<GroupFilter>,
> /// How many snapshots should be transferred at most (taking the newest N snapshots)
> transfer_last: Option<usize>,
> + /// Whether to re-sync corrupted snapshots
> + resync_corrupt: bool,
> }
>
> impl PullParameters {
> @@ -72,12 +75,14 @@ impl PullParameters {
> group_filter: Option<Vec<GroupFilter>>,
> limit: RateLimitConfig,
> transfer_last: Option<usize>,
> + resync_corrupt: Option<bool>,
> ) -> Result<Self, Error> {
> if let Some(max_depth) = max_depth {
> ns.check_max_depth(max_depth)?;
> remote_ns.check_max_depth(max_depth)?;
> };
> let remove_vanished = remove_vanished.unwrap_or(false);
> + let resync_corrupt = resync_corrupt.unwrap_or(false);
>
> let source: Arc<dyn SyncSource> = if let Some(remote) = remote {
> let (remote_config, _digest) = pbs_config::remote::config()?;
> @@ -116,6 +121,7 @@ impl PullParameters {
> max_depth,
> group_filter,
> transfer_last,
> + resync_corrupt,
> })
> }
> }
> @@ -323,7 +329,7 @@ async fn pull_single_archive<'a>(
> ///
> /// Pulling a snapshot consists of the following steps:
> /// - (Re)download the manifest
> -/// -- if it matches, only download log and treat snapshot as already synced
> +/// -- if it matches and is not corrupt, only download log and treat snapshot as already synced
> /// - Iterate over referenced files
> /// -- if file already exists, verify contents
> /// -- if not, pull it from the remote
> @@ -332,6 +338,7 @@ async fn pull_snapshot<'a>(
> reader: Arc<dyn SyncSourceReader + 'a>,
> snapshot: &'a pbs_datastore::BackupDir,
> downloaded_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
> + corrupt: bool,
> ) -> Result<SyncStats, Error> {
> let mut sync_stats = SyncStats::default();
> let mut manifest_name = snapshot.full_path();
> @@ -352,7 +359,7 @@ async fn pull_snapshot<'a>(
> return Ok(sync_stats);
> }
>
> - if manifest_name.exists() {
> + if manifest_name.exists() && !corrupt {
> let manifest_blob = proxmox_lang::try_block!({
> let mut manifest_file = std::fs::File::open(&manifest_name).map_err(|err| {
> format_err!("unable to open local manifest {manifest_name:?} - {err}")
> @@ -381,7 +388,7 @@ async fn pull_snapshot<'a>(
> let mut path = snapshot.full_path();
> path.push(&item.filename);
>
> - if path.exists() {
> + if !corrupt && path.exists() {
> match ArchiveType::from_path(&item.filename)? {
> ArchiveType::DynamicIndex => {
> let index = DynamicIndexReader::open(&path)?;
> @@ -443,6 +450,7 @@ async fn pull_snapshot_from<'a>(
> reader: Arc<dyn SyncSourceReader + 'a>,
> snapshot: &'a pbs_datastore::BackupDir,
> downloaded_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
> + corrupt: bool,
> ) -> Result<SyncStats, Error> {
> let (_path, is_new, _snap_lock) = snapshot
> .datastore()
> @@ -451,7 +459,7 @@ async fn pull_snapshot_from<'a>(
> let sync_stats = if is_new {
is_new and corrupt are never both true..
> info!("sync snapshot {}", snapshot.dir());
>
> - match pull_snapshot(reader, snapshot, downloaded_chunks).await {
> + match pull_snapshot(reader, snapshot, downloaded_chunks, corrupt).await {
so this should always be false ;)
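i.e., the literal could be passed here directly (sketch):

    match pull_snapshot(reader, snapshot, downloaded_chunks, false).await {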
> Err(err) => {
> if let Err(cleanup_err) = snapshot.datastore().remove_backup_dir(
> snapshot.backup_ns(),
> @@ -468,8 +476,15 @@ async fn pull_snapshot_from<'a>(
> }
> }
> } else {
> - info!("re-sync snapshot {}", snapshot.dir());
> - pull_snapshot(reader, snapshot, downloaded_chunks).await?
> + if corrupt {
> + info!(
> + "re-sync snapshot {} due to bad verification result",
nit: why not call it "corrupt", since that is what the parameter is called?
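e.g. something like (just a suggestion):

    info!("re-sync snapshot {} due to corruption", snapshot.dir());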
> + snapshot.dir()
> + );
> + } else {
> + info!("re-sync snapshot {}", snapshot.dir());
> + }
> + pull_snapshot(reader, snapshot, downloaded_chunks, corrupt).await?
> };
>
> Ok(sync_stats)
> @@ -523,26 +538,40 @@ async fn pull_group(
> .last_successful_backup(&target_ns, group)?
> .unwrap_or(i64::MIN);
>
> - let list: Vec<BackupDir> = raw_list
> + // Filter remote BackupDirs to include in pull
> + // Also stores if the snapshot is corrupt (verification job failed)
> + let list: Vec<(BackupDir, bool)> = raw_list
> .into_iter()
> .enumerate()
> - .filter(|&(pos, ref dir)| {
> + .filter_map(|(pos, dir)| {
> source_snapshots.insert(dir.time);
> +            // If resync_corrupt is set, check if the corresponding local snapshot failed
> +            // verification
> + if params.resync_corrupt {
> + let local_dir = params
> + .target
> + .store
> + .backup_dir(target_ns.clone(), dir.clone());
> + if let Ok(local_dir) = local_dir {
> + let verify_state = local_dir.verify_state();
> + if verify_state == Some(VerifyState::Failed) {
> + return Some((dir, true));
> + }
> + }
> + }
> // Note: the snapshot represented by `last_sync_time` might be missing its backup log
> // or post-backup verification state if those were not yet available during the last
> // sync run, always resync it
> if last_sync_time > dir.time {
> already_synced_skip_info.update(dir.time);
> - return false;
> + return None;
> }
> -
> if pos < cutoff && last_sync_time != dir.time {
> transfer_last_skip_info.update(dir.time);
> - return false;
> + return None;
> }
> - true
> + Some((dir, false))
> })
> - .map(|(_, dir)| dir)
> .collect();
>
> if already_synced_skip_info.count > 0 {
> @@ -561,7 +590,7 @@ async fn pull_group(
>
> let mut sync_stats = SyncStats::default();
>
> - for (pos, from_snapshot) in list.into_iter().enumerate() {
> + for (pos, (from_snapshot, corrupt)) in list.into_iter().enumerate() {
> let to_snapshot = params
> .target
> .store
> @@ -571,7 +600,8 @@ async fn pull_group(
> .source
> .reader(source_namespace, &from_snapshot)
> .await?;
> - let result = pull_snapshot_from(reader, &to_snapshot, downloaded_chunks.clone()).await;
> + let result =
> + pull_snapshot_from(reader, &to_snapshot, downloaded_chunks.clone(), corrupt).await;
>
> progress.done_snapshots = pos as u64 + 1;
> info!("percentage done: {progress}");
> --
> 2.39.5