[pbs-devel] [PATCH proxmox-backup v1 1/3] fix #3935: datastore/api2/backup: move datastore locking to '/run'

Wolfgang Bumiller w.bumiller at proxmox.com
Thu Apr 7 10:45:21 CEST 2022


On Wed, Apr 06, 2022 at 11:39:53AM +0200, Stefan Sterz wrote:
(...)
> @@ -308,27 +308,28 @@ impl DataStore {
>  
>          if removed_all {
>              // no snapshots left, we can now safely remove the empty folder
> -            std::fs::remove_dir_all(&full_path)
> -                .map_err(|err| {
> -                    format_err!(
> -                        "removing backup group directory {:?} failed - {}",
> -                        full_path,
> -                        err,
> -                    )
> -                })?;
> +            std::fs::remove_dir_all(&full_path).map_err(|err| {
> +                format_err!(
> +                    "removing backup group directory {:?} failed - {}",
> +                    full_path,
> +                    err,
> +                )
> +            })?;

The above looks like a pure indentation change, unless I'm missing
something. If I'm right, please try to avoid this in an otherwise
functionality-changing patch.

> +
> +            if let Ok(path) = self.group_lock_path(backup_group) {

The only possible failure here is if *creating* the intermediate paths
fails. I'm not a friend of this, see my comment at the `_path()`
functions below...

> +                let _ = std::fs::remove_file(path);
> +            }
>          }
>  
>          Ok(removed_all)
>      }
>  
>      /// Remove a backup directory including all content
> -    pub fn remove_backup_dir(&self, backup_dir: &BackupDir, force: bool) ->  Result<(), Error> {
> -
> -        let full_path = self.snapshot_path(backup_dir);
> -
> +    pub fn remove_backup_dir(&self, backup_dir: &BackupDir, force: bool) -> Result<(), Error> {
>          let (_guard, _manifest_guard);
> +
>          if !force {
> -            _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
> +            _guard = self.lock_snapshot(backup_dir)?;
>              _manifest_guard = self.lock_manifest(backup_dir)?;
>          }
>  
> @@ -336,6 +337,8 @@ impl DataStore {
>              bail!("cannot remove protected snapshot");
>          }
>  
> +        let full_path = self.snapshot_path(backup_dir);
> +
>          log::info!("removing backup snapshot {:?}", full_path);
>          std::fs::remove_dir_all(&full_path)
>              .map_err(|err| {
> @@ -352,6 +355,10 @@ impl DataStore {
>              let _ = std::fs::remove_file(path);
>          }
>  
> +        if let Ok(path) = self.snapshot_lock_path(backup_dir) {

(same)

> +            let _ = std::fs::remove_file(path);
> +        }
> +
>          Ok(())
>      }
>  
(...)
> @@ -952,4 +957,72 @@ impl DataStore {
>  
>          Ok(chunk_list)
>      }
> -}
> +
> +    fn snapshot_lock_path(&self, backup_dir: &BackupDir) -> Result<PathBuf, Error> {
> +        let path = Path::new(DATASTORE_LOCKS_DIR).join(self.name());
> +        std::fs::create_dir_all(&path)?;

Given the name, this function is to query a path. Apart from the helpers
which also create the lock files, the only other current user doesn't
actually *need* the path created, since it only wants to remove it.

I think we should leave the mkdir up to the `fn lock_helper` and drop
the `Result` here.

> +
> +        // hash relative path to get a fixed length unique file name
> +        let rpath_hash =
> +            openssl::sha::sha256(backup_dir.relative_path().to_str().unwrap().as_bytes());

Please use `.as_os_str().as_bytes()` via the `OsStrExt` trait.
Never use `.to_str().unwrap()` on paths.

> +        Ok(path.join(hex::encode(rpath_hash)))
> +    }
> +
> +    fn group_lock_path(&self, group: &BackupGroup) -> Result<PathBuf, Error> {

Same for this function as for `snapshot_lock_path()` above.

> +        let path = Path::new(DATASTORE_LOCKS_DIR).join(self.name());
> +        std::fs::create_dir_all(&path)?;
> +
> +        // hash relative path to get a fixed length unique file name
> +        let rpath_hash = openssl::sha::sha256(group.group_path().to_str().unwrap().as_bytes());
> +        Ok(path.join(hex::encode(rpath_hash)))
> +    }
> +
> +    // this function helps implement the double stat'ing procedure. it
> +    // avoids certain race conditions upon lock deletion.
> +    fn lock_helper<F>(&self, path: &std::path::Path, lock_fn: F) -> Result<BackupLockGuard, Error>
> +    where
> +        F: Fn(&std::path::Path) -> Result<BackupLockGuard, Error>,
> +    {

This is where I'd put the `create_dir_all` part instead.

Also, instead of a path based stat here:

> +        let stat = match nix::sys::stat::stat(path) {
> +            Ok(result) => result.st_ino,
> +            // lock hasn't been created yet, ignore check
> +            Err(e) if e.not_found() => return Ok(lock_fn(path)?),
> +            Err(e) => bail!("could not stat lock file before locking! - {}", e),
> +        };
> +
> +        let lock = lock_fn(path)?;

I'd prefer an `fstat` on the file handle here so the path access happens
only twice instead of 3 times.

For that, you can `impl AsRawFd for BackupLockGuard` (which can just
return -1 for when its inner option is `None`. (Which it shouldn't even
have actually and I wonder if the `cfg(test)` in the tape inventory code
(which is the reason for the Option apparently) could be changed further
so that it doesn't actually use this particular type at all in tests,
but that's a different issue.)

> +
> +        if stat != nix::sys::stat::stat(path)?.st_ino {

Please don't use '?' here because if the file is currently removed we
also want the same nice error message. Something like
`stat().map(|st| st.st_ino != stat).unwrap_or(true)` might work.

> +            bail!("could not acquire lock, another thread modified the lock file");
> +        }
> +
> +        Ok(lock)
> +    }
> +
> +    /// Lock a snapshot exclusively
> +    pub fn lock_snapshot(&self, backup_dir: &BackupDir) -> Result<BackupLockGuard, Error> {
> +        self.lock_helper(&self.snapshot_lock_path(backup_dir)?, |p| {
> +            open_backup_lockfile(p, Some(Duration::from_secs(0)), true).map_err(|err| {
> +                format_err!("unable to acquire snapshot lock {:?} - {}", &p, err)
> +            })
> +        })
> +    }
> +
> +    /// Acquire a shared lock on a snapshot
> +    pub fn lock_snapshot_shared(&self, backup_dir: &BackupDir) -> Result<BackupLockGuard, Error> {
> +        self.lock_helper(&self.snapshot_lock_path(backup_dir)?, |p| {
> +            open_backup_lockfile(p, Some(Duration::from_secs(0)), false).map_err(|err| {
> +                format_err!("unable to acquire shared snapshot lock {:?} - {}", &p, err)
> +            })
> +        })
> +    }
> +
> +    /// Lock a group exclusively
> +    pub fn lock_group(&self, group: &BackupGroup) -> Result<BackupLockGuard, Error> {
> +        self.lock_helper(&self.group_lock_path(group)?, |p| {
> +            open_backup_lockfile(p, Some(Duration::from_secs(0)), true).map_err(|err| {
> +                format_err!("unable to acquire backup group lock {:?} - {}", &p, err)
> +            })
> +        })
> +    }
> +}
> \ No newline at end of file





More information about the pbs-devel mailing list