[pbs-devel] [PATCH v3 proxmox-backup] garbage collection: fix rare race in chunk marking phase
Christian Ebner
c.ebner at proxmox.com
Wed Apr 16 08:12:18 CEST 2025
On 4/15/25 16:49, Christian Ebner wrote:
> During phase 1 of garbage collection referenced chunks are marked as
> in use by iterating over all index files and updating the atime on
> the chunks referenced by these.
>
> In an edge case for long running garbage collection jobs, where a
> newly added snapshot (created after the start of GC) reused known
> chunks from a previous snapshot, but the previous snapshot index
> referencing them disappeared before the marking phase could reach
> that index (e.g. pruned because only 1 snapshot to be kept by
> retention setting), known chunks from that previous index file might
> not be marked (given that they were not marked via any of the
> other index files).
>
> Since commit 74361da8 ("garbage collection: generate index file list
> via datastore iterators") this is even less likely as now the
> iteration reads also index files added during phase 1, and
> therefore either the new or the previous index file will account for
> these chunks (the previous backup snapshot can only be pruned after
> the new one finished, since locked). There remains however a small
> race window between the reading of the snapshots in the backup group
> and the reading of the actual index files for marking.
>
> Fix this race by:
> 1. Checking if the last snapshot of a group disappeared and if so
> 2. generate the list again, looking for new index files previously
> not accounted for
> 3. To avoid possible endless looping, lock the group if the snapshot
> list changed even after the 10th time (which will lead to
> concurrent operations to this group failing).
>
> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
> ---
> changes since version 2:
> - replace needs_retry variable by labeled loop operations
> - check if lock could actually be acquired in last resort branch, fail
> otherwise
> - include catchall case for retry counter overrun
> - catch case where snapshot listing fails if group vanished, only fail
> if the group still exists on the filesystem.
>
> pbs-datastore/src/datastore.rs | 116 +++++++++++++++++++++++----------
> 1 file changed, 80 insertions(+), 36 deletions(-)
>
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
> index aa38e2ac1..8ef5c6860 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -1143,47 +1143,91 @@ impl DataStore {
> let namespace = namespace.context("iterating namespaces failed")?;
> for group in arc_self.iter_backup_groups(namespace)? {
> let group = group.context("iterating backup groups failed")?;
> - let mut snapshots = group.list_backups().context("listing snapshots failed")?;
> - // Sort by snapshot timestamp to iterate over consecutive snapshots for each image.
> - BackupInfo::sort_list(&mut snapshots, true);
> - for snapshot in snapshots {
> - for file in snapshot.files {
> - worker.check_abort()?;
> - worker.fail_on_shutdown()?;
> -
> - let mut path = snapshot.backup_dir.full_path();
> - path.push(file);
> -
> - let index = match self.open_index_reader(&path)? {
> - Some(index) => index,
> - None => {
> - unprocessed_index_list.remove(&path);
> - continue;
> +
> + // Avoid race between listing/marking of snapshots by GC and pruning the last
> + // snapshot in the group, following a new snapshot creation. Otherwise known chunks
> + // might only be referenced by the new snapshot, so it must be read as well.
> + let mut retry_counter = 0;
> + let mut processed_group_indices = HashSet::new();
> + 'retry: loop {
> + let _lock = match retry_counter {
> + 0..=9 => None,
> + 10 => Some(
> + group
> + .lock()
> + .context("exhausted retries and failed to lock group")?,
> + ),
> +                    _ => bail!("exhausted retries and unexpected counter overrun"),
> + };
> +
> + let mut snapshots = match group.list_backups() {
> + Ok(snapshots) => snapshots,
> + Err(err) => {
> + if group.exists() {
> + return Err(err).context("listing snapshots failed")?;
> }
> - };
> - self.index_mark_used_chunks(
> - index,
> - &path,
> - &mut chunk_lru_cache,
> - status,
> - worker,
> - )?;
> -
> - if !unprocessed_index_list.remove(&path) {
> - info!("Encountered new index file '{path:?}', increment total index file count");
> - index_count += 1;
> + break 'retry;
> }
> + };
> +
> + let snapshot_count = snapshots.len();
> + BackupInfo::sort_list(&mut snapshots, true);
> + for (count, snapshot) in snapshots.into_iter().enumerate() {
This can be further optimized and reduced in complexity by iterating in
reverse order (from newest to oldest). That has the following advantages:
- Reduces the window between the list generation and reading the last
snapshots in the group.
- Allows getting rid of the processed_group_indices set, as now one
  has to continue the outer loop anyway if the last snapshot cannot
  be accessed
- The subset of chunks shared between consecutive snapshots remains
unchanged, so the cache to avoid multiple atime updates will have a
similar hit/miss ratio.
Will adapt this and send a new version of the patch.
> + for file in snapshot.files {
> + worker.check_abort()?;
> + worker.fail_on_shutdown()?;
> +
> + match ArchiveType::from_path(&file) {
> + Ok(ArchiveType::FixedIndex) | Ok(ArchiveType::DynamicIndex) => (),
> + Ok(ArchiveType::Blob) | Err(_) => continue,
> + };
> +
> + let mut path = snapshot.backup_dir.full_path();
> + path.push(file);
> +
> + // Avoid reprocessing of already seen index files on retry
> + if retry_counter > 0 && processed_group_indices.contains(&path) {
> + continue;
> + }
> +
> + let index = match self.open_index_reader(&path)? {
> + Some(index) => index,
> + None => {
> + unprocessed_index_list.remove(&path);
> + if count + 1 == snapshot_count {
> + retry_counter += 1;
> + continue 'retry;
> + }
> + continue;
> + }
> + };
> + self.index_mark_used_chunks(
> + index,
> + &path,
> + &mut chunk_lru_cache,
> + status,
> + worker,
> + )?;
> + processed_group_indices.insert(path.clone());
> +
> + if !unprocessed_index_list.remove(&path) {
> + info!("Encountered new index file '{path:?}', increment total index file count");
> + index_count += 1;
> + }
>
> - let percentage = (processed_index_files + 1) * 100 / index_count;
> - if percentage > last_percentage {
> - info!(
> - "marked {percentage}% ({} of {index_count} index files)",
> - processed_index_files + 1,
> - );
> - last_percentage = percentage;
> + let percentage = (processed_index_files + 1) * 100 / index_count;
> + if percentage > last_percentage {
> + info!(
> + "marked {percentage}% ({} of {index_count} index files)",
> + processed_index_files + 1,
> + );
> + last_percentage = percentage;
> + }
> + processed_index_files += 1;
> }
> - processed_index_files += 1;
> }
> +
> + break;
> }
> }
> }
More information about the pbs-devel
mailing list