[pve-devel] [PATCH qemu-server v5 14/16] fix #5440: vzdump: better cleanup fleecing images after hard errors
Fiona Ebner
f.ebner at proxmox.com
Mon Jan 27 12:29:21 CET 2025
By recording the allocated fleecing images in the VM config, they
are not immediately orphaned, should a hard error occur during
backup that prevents cleanup.
An attempt to clean them up is made during the next backup run.
In the cleanup helper, check if fleecing images are still attached in
QEMU and detach them. This allows recovering from more failure
scenarios. However, to avoid a deadlock, a left-over backup job needs
to be canceled first. While canceling a left-over backup already
happens when cleanup is done for a subsequent backup, it is required
for other cases like cleanup before migration (to be added in a
following commit).
Suggested-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
---
Changes in v5:
* cancel backup job if still running before detaching
* use special config section instead of config key
PVE/QemuConfig.pm | 82 ++++++++++++++++++++++++++++++++++++++++
PVE/VZDump/QemuServer.pm | 36 +++++++++++++++---
2 files changed, 112 insertions(+), 6 deletions(-)
diff --git a/PVE/QemuConfig.pm b/PVE/QemuConfig.pm
index 3d57a0a8..92747165 100644
--- a/PVE/QemuConfig.pm
+++ b/PVE/QemuConfig.pm
@@ -13,6 +13,7 @@ use PVE::QemuServer::Monitor qw(mon_cmd);
use PVE::QemuServer;
use PVE::QemuServer::Machine;
use PVE::QemuServer::Memory qw(get_current_memory);
+use PVE::RESTEnvironment qw(log_warn);
use PVE::Storage;
use PVE::Tools;
use PVE::Format qw(render_bytes render_duration);
@@ -578,4 +579,85 @@ sub has_cloudinit {
return $found;
}
+# Stores $volids as a comma-separated list in the 'fleecing' special section of the VM config,
+# overwriting any previous value. Caller is expected to deal with already recorded volumes first.
+sub record_fleecing_images {
+ my ($vmid, $volids) = @_; # $volids: array reference of volume IDs
+
+ return if scalar($volids->@*) == 0; # nothing to record - configuration is left untouched
+
+ PVE::QemuConfig->lock_config($vmid, sub {
+ my $conf = PVE::QemuConfig->load_config($vmid);
+ $conf->{'special-sections'}->{fleecing}->{'fleecing-images'} = join(',', $volids->@*);
+ PVE::QemuConfig->write_config($vmid, $conf);
+ });
+}
+
+# Detaches and frees recorded fleecing images, re-recording those that could not be freed. Will also
+# cancel a running backup job inside QEMU, as detaching the fleecing image can deadlock otherwise.
+sub cleanup_fleecing_images {
+ my ($vmid, $storecfg, $log_func) = @_; # $log_func is optional and gets called as ($level, $line)
+
+ if (!$log_func) { # fallback logger: 'info' lines are printed, everything else goes via log_warn()
+ $log_func = sub {
+ my ($level, $line) = @_;
+ chomp($line);
+ if ($level eq 'info') {
+ print "$line\n";
+ } else {
+ log_warn($line);
+ }
+ };
+ }
+
+ my $volids = []; # volume IDs read from the 'fleecing' special section, filled in below
+ my $failed = []; # volume IDs for which vdisk_free() failed
+
+ # cancel left-over backup job and detach any left-over images from a running VM
+ if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
+ eval {
+ if (my $status = mon_cmd($vmid, 'query-backup')) {
+ if ($status->{status} && $status->{status} eq 'active') {
+ $log_func->('warn', "left-over backup job still running inside QEMU - canceling now");
+ mon_cmd($vmid, 'backup-cancel');
+ }
+ }
+ };
+ $log_func->('warn', "checking/canceling old backup job failed - $@") if $@; # only warn, then carry on
+
+ my $block_info = mon_cmd($vmid, "query-block"); # NOTE(review): not inside an eval, unlike the calls above - confirm intended
+ for my $info ($block_info->@*) {
+ my $device_id = $info->{device};
+ next if $device_id !~ m/-fleecing$/; # only consider devices whose ID ends in '-fleecing'
+
+ $log_func->('info', "detaching (old) fleecing image for '$device_id'");
+ $device_id =~ s/^drive-//; # re-added by qemu_drivedel()
+ eval { PVE::QemuServer::qemu_drivedel($vmid, $device_id) };
+ $log_func->('warn', "error detaching (old) fleecing image '$device_id' - $@") if $@;
+ }
+ }
+
+ PVE::QemuConfig->lock_config($vmid, sub { # fetch the recorded volume IDs and drop them from the config
+ my $conf = PVE::QemuConfig->load_config($vmid);
+ my $special = $conf->{'special-sections'};
+ if (my $fleecing = $special->{fleecing}) {
+ $volids = [PVE::Tools::split_list($fleecing->{'fleecing-images'})];
+ delete $fleecing->{'fleecing-images'};
+ delete $special->{fleecing} if !scalar(keys $fleecing->%*); # drop the section once it is empty
+ PVE::QemuConfig->write_config($vmid, $conf);
+ }
+ });
+
+ for my $volid ($volids->@*) { # freeing happens after the config lock above was released
+ $log_func->('info', "removing (old) fleecing image '$volid'");
+ eval { PVE::Storage::vdisk_free($storecfg, $volid); };
+ if (my $err = $@) {
+ $log_func->('warn', "error removing fleecing image '$volid' - $err");
+ push $failed->@*, $volid;
+ }
+ }
+
+ record_fleecing_images($vmid, $failed); # keep track of images that could not be removed
+}
+
1;
diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
index cdaaa3a2..55325217 100644
--- a/PVE/VZDump/QemuServer.pm
+++ b/PVE/VZDump/QemuServer.pm
@@ -533,15 +533,25 @@ sub get_and_check_pbs_encryption_config {
die "internal error - unhandled case for getting & checking PBS encryption ($keyfile, $master_keyfile)!";
}
+# Helper is intended to be called from allocate_fleecing_images() only. Otherwise, fleecing volids
+# have already been recorded in the configuration and PVE::QemuConfig::cleanup_fleecing_images()
+# should be used instead.
my sub cleanup_fleecing_images {
- my ($self, $disks) = @_;
+ my ($self, $vmid, $disks) = @_; # $disks: array reference of hashes, 'fleece-volid' is read from each
+
+ my $failed = []; # volume IDs for which vdisk_free() failed
for my $di ($disks->@*) {
if (my $volid = $di->{'fleece-volid'}) {
eval { PVE::Storage::vdisk_free($self->{storecfg}, $volid); };
- $self->log('warn', "error removing fleecing image '$volid' - $@") if $@;
+ if (my $err = $@) {
+ $self->log('warn', "error removing fleecing image '$volid' - $err");
+ push $failed->@*, $volid;
+ }
}
}
+
+ PVE::QemuConfig::record_fleecing_images($vmid, $failed); # record what could not be removed in the VM config
}
my sub allocate_fleecing_images {
@@ -549,8 +559,7 @@ my sub allocate_fleecing_images {
die "internal error - no fleecing storage specified\n" if !$fleecing_storeid;
- # TODO what about potential left-over images from a failed attempt? Just
- # auto-remove? While unlikely, could conflict with manually created image from user...
+ my $fleece_volids = [];
eval {
my $n = 0; # counter for fleecing image names
@@ -567,6 +576,8 @@ my sub allocate_fleecing_images {
$di->{'fleece-volid'} = PVE::Storage::vdisk_alloc(
$self->{storecfg}, $fleecing_storeid, $vmid, $format, $name, $size);
+ push $fleece_volids->@*, $di->{'fleece-volid'};
+
$n++;
} else {
die "implement me (type '$di->{type}')";
@@ -574,9 +585,11 @@ my sub allocate_fleecing_images {
}
};
if (my $err = $@) {
- cleanup_fleecing_images($self, $disks);
+ cleanup_fleecing_images($self, $vmid, $disks);
die $err;
}
+
+ PVE::QemuConfig::record_fleecing_images($vmid, $fleece_volids);
}
my sub detach_fleecing_images {
@@ -636,6 +649,13 @@ my sub check_and_prepare_fleecing {
$use_fleecing = 0;
}
+ # clean up potential left-overs from a previous attempt
+ eval {
+ PVE::QemuConfig::cleanup_fleecing_images(
+ $vmid, $self->{storecfg}, sub { $self->log($_[0], $_[1]); });
+ };
+ $self->log('warn', "attempt to clean up left-over fleecing images failed - $@") if $@;
+
if ($use_fleecing) {
my ($default_format, $valid_formats) = PVE::Storage::storage_default_format(
$self->{storecfg}, $fleecing_opts->{storage});
@@ -1132,7 +1152,11 @@ sub cleanup {
# If VM was started only for backup, it is already stopped now.
if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
$detach_tpmstate_drive->($task, $vmid);
- detach_fleecing_images($task->{disks}, $vmid) if $task->{'use-fleecing'};
+ if ($task->{'use-fleecing'}) {
+ detach_fleecing_images($task->{disks}, $vmid);
+ PVE::QemuConfig::cleanup_fleecing_images(
+ $vmid, $self->{storecfg}, sub { $self->log($_[0], $_[1]); });
+ }
}
cleanup_fleecing_images($self, $task->{disks}) if $task->{'use-fleecing'};
--
2.39.5
More information about the pve-devel
mailing list