[pbs-devel] [PATCH proxmox-backup 1/2] fix #6750: api: avoid possible deadlock on datastores with s3 backend

Fabian Grünbichler f.gruenbichler at proxmox.com
Thu Sep 25 14:41:19 CEST 2025


On September 24, 2025 4:56 pm, Christian Ebner wrote:
> Closing of the fixed or dynamic index files with s3 backend will call
> async code, which must be avoided because of possible deadlocks [0].
> Therefore, perform all changes on the shared backup state and drop the
> guard before uploading the fixed index file to the s3 backend.
> 
> Account for active backend operations and check consistency, since it
> must be assured that all active backend operations are finished before
> the finish call can succeed.
> 
> [0] https://docs.rs/tokio/latest/tokio/sync/struct.Mutex.html#which-kind-of-mutex-should-you-use
> 
> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=6750
> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
> ---
>  src/api2/backup/environment.rs | 77 +++++++++++++++++++++++-----------
>  1 file changed, 53 insertions(+), 24 deletions(-)
> 
> diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
> index d5e6869cd..e535891a4 100644
> --- a/src/api2/backup/environment.rs
> +++ b/src/api2/backup/environment.rs
> @@ -82,6 +82,7 @@ struct SharedBackupState {
>      finished: bool,
>      uid_counter: usize,
>      file_counter: usize, // successfully uploaded files
> +    active_backend_operations: usize,
>      dynamic_writers: HashMap<usize, DynamicWriterState>,
>      fixed_writers: HashMap<usize, FixedWriterState>,
>      known_chunks: KnownChunksMap,
> @@ -135,6 +136,7 @@ impl BackupEnvironment {
>              finished: false,
>              uid_counter: 0,
>              file_counter: 0,
> +            active_backend_operations: 0,
>              dynamic_writers: HashMap::new(),
>              fixed_writers: HashMap::new(),
>              known_chunks: HashMap::new(),
> @@ -483,15 +485,10 @@ impl BackupEnvironment {
>              );
>          }
>  
> -        // For S3 backends, upload the index file to the object store after closing
> -        if let DatastoreBackend::S3(s3_client) = &self.backend {
> -            self.s3_upload_index(s3_client, &data.name)
> -                .context("failed to upload dynamic index to s3 backend")?;
> -            self.log(format!(
> -                "Uploaded dynamic index file to s3 backend: {}",
> -                data.name
> -            ))
> -        }
> +        state.file_counter += 1;
> +        state.backup_size += size;
> +        state.backup_stat = state.backup_stat + data.upload_stat;
> +        state.active_backend_operations += 1;
>  
>          self.log_upload_stat(
>              &data.name,
> @@ -502,9 +499,21 @@ impl BackupEnvironment {
>              &data.upload_stat,
>          );
>  
> -        state.file_counter += 1;
> -        state.backup_size += size;
> -        state.backup_stat = state.backup_stat + data.upload_stat;
> +        // never hold mutex guard during s3 upload due to possible deadlocks
> +        drop(state);
> +
> +        // For S3 backends, upload the index file to the object store after closing
> +        if let DatastoreBackend::S3(s3_client) = &self.backend {
> +            self.s3_upload_index(s3_client, &data.name)
> +                .context("failed to upload dynamic index to s3 backend")?;
> +            self.log(format!(
> +                "Uploaded dynamic index file to s3 backend: {}",
> +                data.name
> +            ))
> +        }
> +
> +        let mut state = self.state.lock().unwrap();
> +        state.active_backend_operations -= 1;

These two hunks are okay, although we could also reuse the registered
writers map to encode whether a writer is active, being processed/closed,
or doesn't exist — that would allow more fine-grained logging.

>          Ok(())
>      }
> @@ -567,15 +576,10 @@ impl BackupEnvironment {
>              );
>          }
>  
> -        // For S3 backends, upload the index file to the object store after closing
> -        if let DatastoreBackend::S3(s3_client) = &self.backend {
> -            self.s3_upload_index(s3_client, &data.name)
> -                .context("failed to upload fixed index to s3 backend")?;
> -            self.log(format!(
> -                "Uploaded fixed index file to object store: {}",
> -                data.name
> -            ))
> -        }
> +        state.file_counter += 1;
> +        state.backup_size += size;
> +        state.backup_stat = state.backup_stat + data.upload_stat;
> +        state.active_backend_operations += 1;
>  
>          self.log_upload_stat(
>              &data.name,
> @@ -586,9 +590,21 @@ impl BackupEnvironment {
>              &data.upload_stat,
>          );
>  
> -        state.file_counter += 1;
> -        state.backup_size += size;
> -        state.backup_stat = state.backup_stat + data.upload_stat;
> +        // never hold mutex guard during s3 upload due to possible deadlocks
> +        drop(state);
> +
> +        // For S3 backends, upload the index file to the object store after closing
> +        if let DatastoreBackend::S3(s3_client) = &self.backend {
> +            self.s3_upload_index(s3_client, &data.name)
> +                .context("failed to upload fixed index to s3 backend")?;
> +            self.log(format!(
> +                "Uploaded fixed index file to object store: {}",
> +                data.name
> +            ))
> +        }
> +
> +        let mut state = self.state.lock().unwrap();
> +        state.active_backend_operations -= 1;
>  
>          Ok(())
>      }
> @@ -645,6 +661,13 @@ impl BackupEnvironment {
>              bail!("found open index writer - unable to finish backup");
>          }
>  
> +        if state.active_backend_operations != 0 {
> +            bail!(
> +                "backup task still has {} active operations.",
> +                state.active_backend_operations,
> +            );
> +        }
> +
>          if state.file_counter == 0 {
>              bail!("backup does not contain valid files (file count == 0)");
>          }
> @@ -753,6 +776,12 @@ impl BackupEnvironment {
>          if !state.finished {
>              bail!("backup ended but finished flag is not set.");
>          }
> +        if state.active_backend_operations != 0 {
> +            bail!(
> +                "backup ended but {} active backend operations.",
> +                state.active_backend_operations,
> +            );
> +        }

There's now an inconsistency between ensure_finished(), which checks both
conditions, and finished() (used to determine whether an interrupted
connection is benign or not!), which only checks the finished flag.

>          Ok(())
>      }
>  
> -- 
> 2.47.3
> 
> 
> 
> _______________________________________________
> pbs-devel mailing list
> pbs-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
> 
> 
> 




More information about the pbs-devel mailing list