[pbs-devel] [PATCH proxmox-backup v3 1/3] partial fix #3701: sync job: pull: add transfer-last parameter
Fabian Grünbichler
f.gruenbichler at proxmox.com
Mon Apr 17 10:56:13 CEST 2023
On April 13, 2023 5:41 pm, Stefan Hanreich wrote:
> Specifying the transfer-last parameter limits the amount of backups
> that get synced via the pull command/sync job. The parameter specifies
> how many of the N latest backups should get pulled/synced. All other
> backups will get skipped.
>
> This is particularly useful in situations where the sync target has
> less disk space than the source. Syncing all backups from the source
> is not possible if there is not enough disk space on the target.
> Additionally this can be used for limiting the amount of data
> transferred, reducing load on the network.
>
> The newest backup will always get re-synced, regardless of the setting
> of the transfer-last parameter.
we had some off-list discussion about the logging issue, so mostly
rehashing this here..
>
> Signed-off-by: Stefan Hanreich <s.hanreich at proxmox.com>
> ---
> pbs-api-types/src/jobs.rs | 11 +++++
> src/api2/config/sync.rs | 9 ++++
> src/api2/pull.rs | 10 ++++-
> src/bin/proxmox-backup-manager.rs | 11 ++++-
> src/server/pull.rs | 68 +++++++++++++++++++++++--------
> 5 files changed, 91 insertions(+), 18 deletions(-)
>
> diff --git a/pbs-api-types/src/jobs.rs b/pbs-api-types/src/jobs.rs
> index cf7618c4..b9f57719 100644
> --- a/pbs-api-types/src/jobs.rs
> +++ b/pbs-api-types/src/jobs.rs
> @@ -444,6 +444,11 @@ pub const GROUP_FILTER_SCHEMA: Schema = StringSchema::new(
> pub const GROUP_FILTER_LIST_SCHEMA: Schema =
> ArraySchema::new("List of group filters.", &GROUP_FILTER_SCHEMA).schema();
>
> +pub const TRANSFER_LAST_SCHEMA: Schema =
> + IntegerSchema::new("The maximum amount of snapshots to be transferred (per group).")
nit: the description could also be read as "only transfer X snapshots in
this sync invocation". maybe we can find some phrasing that conveys both
- if there are more snapshots, they are skipped
- the count starts at the end, not at the beginning (implied by "last"
in the name)
maybe something like
"Limit transfer to last N snapshots (per group), skipping others."
> + .minimum(1)
> + .schema();
> +
> #[api(
> properties: {
> id: {
> @@ -493,6 +498,10 @@ pub const GROUP_FILTER_LIST_SCHEMA: Schema =
> schema: GROUP_FILTER_LIST_SCHEMA,
> optional: true,
> },
> + "transfer-last": {
> + schema: TRANSFER_LAST_SCHEMA,
> + optional: true,
> + },
> }
> )]
> #[derive(Serialize, Deserialize, Clone, Updater, PartialEq)]
> @@ -522,6 +531,8 @@ pub struct SyncJobConfig {
> pub group_filter: Option<Vec<GroupFilter>>,
> #[serde(flatten)]
> pub limit: RateLimitConfig,
> + #[serde(skip_serializing_if = "Option::is_none")]
> + pub transfer_last: Option<usize>,
> }
>
> impl SyncJobConfig {
> diff --git a/src/api2/config/sync.rs b/src/api2/config/sync.rs
> index bd7373df..01e5f2ce 100644
> --- a/src/api2/config/sync.rs
> +++ b/src/api2/config/sync.rs
> @@ -215,6 +215,8 @@ pub enum DeletableProperty {
> RemoteNs,
> /// Delete the max_depth property,
> MaxDepth,
> + /// Delete the transfer_last property,
> + TransferLast,
> }
>
> #[api(
> @@ -309,6 +311,9 @@ pub fn update_sync_job(
> DeletableProperty::MaxDepth => {
> data.max_depth = None;
> }
> + DeletableProperty::TransferLast => {
> + data.transfer_last = None;
> + }
> }
> }
> }
> @@ -343,6 +348,9 @@ pub fn update_sync_job(
> if let Some(group_filter) = update.group_filter {
> data.group_filter = Some(group_filter);
> }
> + if let Some(transfer_last) = update.transfer_last {
> + data.transfer_last = Some(transfer_last);
> + }
>
> if update.limit.rate_in.is_some() {
> data.limit.rate_in = update.limit.rate_in;
> @@ -507,6 +515,7 @@ acl:1:/remote/remote1/remotestore1:write at pbs:RemoteSyncOperator
> group_filter: None,
> schedule: None,
> limit: pbs_api_types::RateLimitConfig::default(), // no limit
> + transfer_last: None,
> };
>
> // should work without ACLs
> diff --git a/src/api2/pull.rs b/src/api2/pull.rs
> index b2473ec8..daeba7cf 100644
> --- a/src/api2/pull.rs
> +++ b/src/api2/pull.rs
> @@ -10,6 +10,7 @@ use pbs_api_types::{
> Authid, BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
> GROUP_FILTER_LIST_SCHEMA, NS_MAX_DEPTH_REDUCED_SCHEMA, PRIV_DATASTORE_BACKUP,
> PRIV_DATASTORE_PRUNE, PRIV_REMOTE_READ, REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA,
> + TRANSFER_LAST_SCHEMA,
> };
> use pbs_config::CachedUserInfo;
> use proxmox_rest_server::WorkerTask;
> @@ -76,6 +77,7 @@ impl TryFrom<&SyncJobConfig> for PullParameters {
> sync_job.max_depth,
> sync_job.group_filter.clone(),
> sync_job.limit.clone(),
> + sync_job.transfer_last,
> )
> }
> }
> @@ -201,7 +203,11 @@ pub fn do_sync_job(
> limit: {
> type: RateLimitConfig,
> flatten: true,
> - }
> + },
> + "transfer-last": {
> + schema: TRANSFER_LAST_SCHEMA,
> + optional: true,
> + },
> },
> },
> access: {
> @@ -225,6 +231,7 @@ async fn pull(
> max_depth: Option<usize>,
> group_filter: Option<Vec<GroupFilter>>,
> limit: RateLimitConfig,
> + transfer_last: Option<usize>,
> rpcenv: &mut dyn RpcEnvironment,
> ) -> Result<String, Error> {
> let auth_id: Authid = rpcenv.get_auth_id().unwrap().parse()?;
> @@ -257,6 +264,7 @@ async fn pull(
> max_depth,
> group_filter,
> limit,
> + transfer_last,
> )?;
> let client = pull_params.client().await?;
>
> diff --git a/src/bin/proxmox-backup-manager.rs b/src/bin/proxmox-backup-manager.rs
> index 740fdc49..b4cb6cb3 100644
> --- a/src/bin/proxmox-backup-manager.rs
> +++ b/src/bin/proxmox-backup-manager.rs
> @@ -13,7 +13,7 @@ use pbs_api_types::percent_encoding::percent_encode_component;
> use pbs_api_types::{
> BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
> GROUP_FILTER_LIST_SCHEMA, IGNORE_VERIFIED_BACKUPS_SCHEMA, NS_MAX_DEPTH_SCHEMA,
> - REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, UPID_SCHEMA,
> + REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, TRANSFER_LAST_SCHEMA, UPID_SCHEMA,
> VERIFICATION_OUTDATED_AFTER_SCHEMA,
> };
> use pbs_client::{display_task_log, view_task_result};
> @@ -272,6 +272,10 @@ fn task_mgmt_cli() -> CommandLineInterface {
> schema: OUTPUT_FORMAT,
> optional: true,
> },
> + "transfer-last": {
> + schema: TRANSFER_LAST_SCHEMA,
> + optional: true,
> + },
> }
> }
> )]
> @@ -287,6 +291,7 @@ async fn pull_datastore(
> max_depth: Option<usize>,
> group_filter: Option<Vec<GroupFilter>>,
> limit: RateLimitConfig,
> + transfer_last: Option<usize>,
> param: Value,
> ) -> Result<Value, Error> {
> let output_format = get_output_format(&param);
> @@ -319,6 +324,10 @@ async fn pull_datastore(
> args["remove-vanished"] = Value::from(remove_vanished);
> }
>
> + if transfer_last.is_some() {
> + args["transfer-last"] = json!(transfer_last)
> + }
> +
> let mut limit_json = json!(limit);
> let limit_map = limit_json
> .as_object_mut()
> diff --git a/src/server/pull.rs b/src/server/pull.rs
> index 65eedf2c..37058f9b 100644
> --- a/src/server/pull.rs
> +++ b/src/server/pull.rs
> @@ -59,6 +59,8 @@ pub(crate) struct PullParameters {
> group_filter: Option<Vec<GroupFilter>>,
> /// Rate limits for all transfers from `remote`
> limit: RateLimitConfig,
> + /// How many snapshots should be transferred at most (taking the newest N snapshots)
> + transfer_last: Option<usize>,
> }
>
> impl PullParameters {
> @@ -78,6 +80,7 @@ impl PullParameters {
> max_depth: Option<usize>,
> group_filter: Option<Vec<GroupFilter>>,
> limit: RateLimitConfig,
> + transfer_last: Option<usize>,
> ) -> Result<Self, Error> {
> let store = DataStore::lookup_datastore(store, Some(Operation::Write))?;
>
> @@ -109,6 +112,7 @@ impl PullParameters {
> max_depth,
> group_filter,
> limit,
> + transfer_last,
> })
> }
>
> @@ -537,15 +541,24 @@ async fn pull_snapshot_from(
> Ok(())
> }
>
> +enum SkipReason {
> + AlreadySynced,
> + TransferLast,
> +}
> +
this could be kept, but
> struct SkipInfo {
> oldest: i64,
> newest: i64,
> - count: u64,
> + already_synced_count: u64,
> + transfer_last_count: u64,
> }
instead of this, we could just store the reason here.
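for example, a rough sketch of what I mean (field name just for
illustration):

    struct SkipInfo {
        oldest: i64,
        newest: i64,
        count: u64,
        skip_reason: SkipReason,
    }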
>
> impl SkipInfo {
> - fn update(&mut self, backup_time: i64) {
> - self.count += 1;
> + fn update(&mut self, backup_time: i64, skip_reason: SkipReason) {
> + match skip_reason {
> + SkipReason::AlreadySynced => self.already_synced_count += 1,
> + SkipReason::TransferLast => self.transfer_last_count += 1,
> + }
this would then not be needed
>
> if backup_time < self.oldest {
> self.oldest = backup_time;
> @@ -556,8 +569,13 @@ impl SkipInfo {
> }
> }
>
> + fn count(&self) -> u64 {
> + self.already_synced_count
> + .saturating_add(self.transfer_last_count)
> + }
> +
neither would this
> fn affected(&self) -> Result<String, Error> {
> - match self.count {
> + match self.count() {
or this
> 0 => Ok(String::new()),
> 1 => Ok(proxmox_time::epoch_to_rfc3339_utc(self.oldest)?),
> _ => Ok(format!(
> @@ -574,12 +592,23 @@ impl std::fmt::Display for SkipInfo {
> write!(
> f,
> "skipped: {} snapshot(s) ({}) older than the newest local snapshot",
> - self.count,
> + self.count(),
here, only the format string would need to be conditionalized based on
the `reason`..
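something along these lines (sketch only, exact message wording up for
debate):

    impl std::fmt::Display for SkipInfo {
        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
            let reason = match self.skip_reason {
                SkipReason::AlreadySynced => "older than the newest local snapshot",
                SkipReason::TransferLast => "due to transfer-last",
            };
            write!(
                f,
                "skipped: {} snapshot(s) ({}) - {}",
                self.count,
                self.affected().map_err(|_| std::fmt::Error)?,
                reason,
            )
        }
    }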
> self.affected().map_err(|_| std::fmt::Error)?
> )
> }
> }
>
> +impl Default for SkipInfo {
> + fn default() -> Self {
> + SkipInfo {
> + oldest: i64::MAX,
> + newest: i64::MIN,
> + already_synced_count: 0,
> + transfer_last_count: 0,
> + }
> + }
> +}
> +
not sure whether this is needed.. probably we'd rather have a
constructor taking a `SkipReason`?
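i.e., something like

    impl SkipInfo {
        fn new(skip_reason: SkipReason) -> Self {
            SkipInfo {
                oldest: i64::MAX,
                newest: i64::MIN,
                count: 0,
                skip_reason,
            }
        }
    }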
> /// Pulls a group according to `params`.
> ///
> /// Pulling a group consists of the following steps:
> @@ -632,6 +661,7 @@ async fn pull_group(
> let fingerprint = client.fingerprint();
>
> let last_sync = params.store.last_successful_backup(&target_ns, group)?;
> + let last_sync_time = last_sync.unwrap_or(i64::MIN);
>
> let mut remote_snapshots = std::collections::HashSet::new();
>
> @@ -640,11 +670,14 @@ async fn pull_group(
>
> progress.group_snapshots = list.len() as u64;
>
> - let mut skip_info = SkipInfo {
> - oldest: i64::MAX,
> - newest: i64::MIN,
> - count: 0,
> - };
> + let mut skip_info = SkipInfo::default();
here, we would then construct 2 `SkipInfo`s, one for "old" snapshots,
one for "new, but skipped by transfer-last".
> +
> + let total_amount = list.len();
> +
> + let cutoff = params
> + .transfer_last
> + .map(|count| total_amount.saturating_sub(count))
> + .unwrap_or_default();
>
> for (pos, item) in list.into_iter().enumerate() {
> let snapshot = item.backup;
> @@ -661,11 +694,14 @@ async fn pull_group(
>
> remote_snapshots.insert(snapshot.time);
>
> - if let Some(last_sync_time) = last_sync {
> - if last_sync_time > snapshot.time {
> - skip_info.update(snapshot.time);
> - continue;
> - }
> + if last_sync_time > snapshot.time {
> + skip_info.update(snapshot.time, SkipReason::AlreadySynced);
> + continue;
> + }
> +
> + if pos < cutoff && last_sync_time != snapshot.time {
> + skip_info.update(snapshot.time, SkipReason::TransferLast);
> + continue;
> }
>
and ideally here, we could find good criteria for printing the
corresponding SkipInfo at the point where we switch from skipping to
transferring for both cases..
maybe

    else if info.count > 0 {
        print info;
        clear info; // prevents taking this if again
    }

for the first one and

    else if pos >= cutoff && other_info.count > 0 {
        ..
    }

for the second one would be enough?
that way the sequence of logs would be:
- skipped old (happens in most jobs, except for first run)
- re-sync of last synced snapshot (if it still exists on source)
- skipped because of transfer-last (if set and skips something)
- sync of new snapshots (if they exist)
which matches the chronological order of the snapshots referenced by the
log messages, and would be nicer than
- re-sync last synced snapshot
- sync new snapshots
- print skip info of old snapshots
- print skip info of transfer-last snapshots
in my opinion.
in any case, tracking the two skip infos separately allows us to
- give more information (end of regular skip, start of transfer-last
skip; there can be a snapshot in between that is re-synced after all)
- not handle newlines in a weird way
- save a line in the log, since we can just print each on one line,
instead of the combined one + two lines for the counts
if we ever add more criteria on the snapshot level that cause skipping,
we can still re-evaluate and possibly have some kind of map/vec for the
counts, and move the reason to the key/index, but for two kinds there's
no reason not to keep it simple for now IMHO.
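putting the above together, the relevant part of the loop could then look
roughly like this (untested sketch; `reset()` would be a hypothetical
helper clearing count/oldest/newest):

    for (pos, item) in list.into_iter().enumerate() {
        // ... existing checks and remote_snapshots bookkeeping ...

        if last_sync_time > snapshot.time {
            already_synced_skip_info.update(snapshot.time);
            continue;
        } else if already_synced_skip_info.count > 0 {
            task_log!(worker, "{}", already_synced_skip_info);
            already_synced_skip_info.reset(); // prevents taking this branch again
        }

        if pos < cutoff && last_sync_time != snapshot.time {
            transfer_last_skip_info.update(snapshot.time);
            continue;
        } else if pos >= cutoff && transfer_last_skip_info.count > 0 {
            task_log!(worker, "{}", transfer_last_skip_info);
            transfer_last_skip_info.reset();
        }

        // ... pull the snapshot ...
    }

    // the counts can still be non-zero here if the group ended while
    // skipping, so both infos would need a final print after the loop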
> // get updated auth_info (new tickets)
> @@ -725,7 +761,7 @@ async fn pull_group(
> }
> }
>
> - if skip_info.count > 0 {
> + if skip_info.count() > 0 {
> task_log!(worker, "{}", skip_info);
> }
>
> --
> 2.30.2
>
>