[pbs-devel] [PATCH proxmox-backup 1/2] fix #6750: api: avoid possible deadlock on datastores with s3 backend
Fabian Grünbichler
f.gruenbichler at proxmox.com
Thu Sep 25 14:41:19 CEST 2025
On September 24, 2025 4:56 pm, Christian Ebner wrote:
> Closing a fixed or dynamic index file with the s3 backend calls into
> async code, which must never happen while holding the shared backup
> state mutex guard because of possible deadlocks [0]. Therefore, perform
> all changes on the shared backup state and drop the guard before
> uploading the index file to the s3 backend.
>
> Account for active backend operations and check them for consistency,
> since all active backend operations must have finished before the
> finish call can succeed.
>
> [0] https://docs.rs/tokio/latest/tokio/sync/struct.Mutex.html#which-kind-of-mutex-should-you-use
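for context: the problematic pattern is holding the std mutex guard
across a call that may drive async code (e.g. via block_on). a minimal,
self-contained sketch of the pattern this patch moves to - names are
simplified, and backend_upload() is just a hypothetical stand-in for
the s3 index upload:

    use std::sync::Mutex;

    struct SharedState {
        active_backend_operations: usize,
    }

    struct Env {
        state: Mutex<SharedState>,
    }

    impl Env {
        // hypothetical stand-in for s3_upload_index(), which may
        // internally drive async code
        fn backend_upload(&self) -> Result<(), std::io::Error> {
            Ok(())
        }

        fn close_writer(&self) -> Result<(), std::io::Error> {
            let mut state = self.state.lock().unwrap();
            // do all bookkeeping while holding the guard..
            state.active_backend_operations += 1;
            // ..but never hold it across the upload, see [0]
            drop(state);

            self.backend_upload()?;

            // re-acquire the guard just to update the counter
            let mut state = self.state.lock().unwrap();
            state.active_backend_operations -= 1;
            Ok(())
        }
    }
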
>
> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=6750
> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
> ---
> src/api2/backup/environment.rs | 77 +++++++++++++++++++++++-----------
> 1 file changed, 53 insertions(+), 24 deletions(-)
>
> diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
> index d5e6869cd..e535891a4 100644
> --- a/src/api2/backup/environment.rs
> +++ b/src/api2/backup/environment.rs
> @@ -82,6 +82,7 @@ struct SharedBackupState {
> finished: bool,
> uid_counter: usize,
> file_counter: usize, // successfully uploaded files
> + active_backend_operations: usize,
> dynamic_writers: HashMap<usize, DynamicWriterState>,
> fixed_writers: HashMap<usize, FixedWriterState>,
> known_chunks: KnownChunksMap,
> @@ -135,6 +136,7 @@ impl BackupEnvironment {
> finished: false,
> uid_counter: 0,
> file_counter: 0,
> + active_backend_operations: 0,
> dynamic_writers: HashMap::new(),
> fixed_writers: HashMap::new(),
> known_chunks: HashMap::new(),
> @@ -483,15 +485,10 @@ impl BackupEnvironment {
> );
> }
>
> - // For S3 backends, upload the index file to the object store after closing
> - if let DatastoreBackend::S3(s3_client) = &self.backend {
> - self.s3_upload_index(s3_client, &data.name)
> - .context("failed to upload dynamic index to s3 backend")?;
> - self.log(format!(
> - "Uploaded dynamic index file to s3 backend: {}",
> - data.name
> - ))
> - }
> + state.file_counter += 1;
> + state.backup_size += size;
> + state.backup_stat = state.backup_stat + data.upload_stat;
> + state.active_backend_operations += 1;
>
> self.log_upload_stat(
> &data.name,
> @@ -502,9 +499,21 @@ impl BackupEnvironment {
> &data.upload_stat,
> );
>
> - state.file_counter += 1;
> - state.backup_size += size;
> - state.backup_stat = state.backup_stat + data.upload_stat;
> + // never hold mutex guard during s3 upload due to possible deadlocks
> + drop(state);
> +
> + // For S3 backends, upload the index file to the object store after closing
> + if let DatastoreBackend::S3(s3_client) = &self.backend {
> + self.s3_upload_index(s3_client, &data.name)
> + .context("failed to upload dynamic index to s3 backend")?;
> + self.log(format!(
> + "Uploaded dynamic index file to s3 backend: {}",
> + data.name
> + ))
> + }
> +
> + let mut state = self.state.lock().unwrap();
> + state.active_backend_operations -= 1;
these two hunks are okay, although we could also reuse the registered
writers map to encode whether a writer is active, currently being
processed/closed, or doesn't exist at all? that would allow more
fine-grained logging..
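rough sketch of what I mean (completely untested, names invented here):

    // instead of a bare counter, each entry in the writers map encodes
    // the writer's lifecycle - a lookup miss then means "doesn't exist"
    enum WriterState {
        // writer registered, chunks still being uploaded
        // (DynamicWriterState as already defined in environment.rs)
        Active(DynamicWriterState),
        // index closed, backend upload still in flight
        Uploading { name: String },
    }

with that, finish could bail with the name of the file whose backend
upload is still pending, instead of just a count.
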
> Ok(())
> }
> @@ -567,15 +576,10 @@ impl BackupEnvironment {
> );
> }
>
> - // For S3 backends, upload the index file to the object store after closing
> - if let DatastoreBackend::S3(s3_client) = &self.backend {
> - self.s3_upload_index(s3_client, &data.name)
> - .context("failed to upload fixed index to s3 backend")?;
> - self.log(format!(
> - "Uploaded fixed index file to object store: {}",
> - data.name
> - ))
> - }
> + state.file_counter += 1;
> + state.backup_size += size;
> + state.backup_stat = state.backup_stat + data.upload_stat;
> + state.active_backend_operations += 1;
>
> self.log_upload_stat(
> &data.name,
> @@ -586,9 +590,21 @@ impl BackupEnvironment {
> &data.upload_stat,
> );
>
> - state.file_counter += 1;
> - state.backup_size += size;
> - state.backup_stat = state.backup_stat + data.upload_stat;
> + // never hold mutex guard during s3 upload due to possible deadlocks
> + drop(state);
> +
> + // For S3 backends, upload the index file to the object store after closing
> + if let DatastoreBackend::S3(s3_client) = &self.backend {
> + self.s3_upload_index(s3_client, &data.name)
> + .context("failed to upload fixed index to s3 backend")?;
> + self.log(format!(
> + "Uploaded fixed index file to object store: {}",
> + data.name
> + ))
> + }
> +
> + let mut state = self.state.lock().unwrap();
> + state.active_backend_operations -= 1;
>
> Ok(())
> }
> @@ -645,6 +661,13 @@ impl BackupEnvironment {
> bail!("found open index writer - unable to finish backup");
> }
>
> + if state.active_backend_operations != 0 {
> + bail!(
> + "backup task still has {} active operations.",
> + state.active_backend_operations,
> + );
> + }
> +
> if state.file_counter == 0 {
> bail!("backup does not contain valid files (file count == 0)");
> }
> @@ -753,6 +776,12 @@ impl BackupEnvironment {
> if !state.finished {
> bail!("backup ended but finished flag is not set.");
> }
> + if state.active_backend_operations != 0 {
> + bail!(
> + "backup ended but {} active backend operations.",
> + state.active_backend_operations,
> + );
> + }
there's now an inconsistency between ensure_finished(), which checks
both conditions, and finished() (used to determine whether an
interrupted connection is benign or not!), which only checks the
finished flag..
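i.e. something along these lines might be needed to keep the two in
sync (untested, assuming finished() currently just reads the flag):

    /// Returns whether the backup is finished and no backend operation
    /// is in flight anymore
    pub fn finished(&self) -> bool {
        let state = self.state.lock().unwrap();
        state.finished && state.active_backend_operations == 0
    }
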
> Ok(())
> }
>
> --
> 2.47.3