[pve-devel] [PATCH v2 qemu-server 13/13] migration: move finishing block jobs to phase2 for better/uniform error handling

Fri Jan 29 16:11:43 CET 2021

This avoids the possibility of dying during phase3_cleanup. Instead of needing
to duplicate the cleanup ourselves, we benefit from phase2_cleanup doing so.

The duplicate cleanup was also very incomplete: it didn't stop the remote kvm
process (leading to 'VM already running' when trying to migrate again
afterwards), but it removed its disks, and it didn't unlock the config, didn't
close the tunnel and didn't cancel the block-dirty bitmaps.

Since migrate_cancel should do nothing after the (non-storage) migrate process
has completed, even that cleanup step is fine here.

Since phase3 is empty at the moment, the order of operations is still the same.

Also add a test that, before this patch, would complain about finish_tunnel not
being called. That test also checks that local disks are not already removed
before finishing the block jobs.

Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
---

New in v2

The test would also expose the temporary breakage with the wrong #8/#9 patch
order

With and without this patch: when dying here, i.e. when finishing the block
jobs, the VM is in a blocked state afterwards (postmigrate), because the
(non-storage) migration was successful. Simply resuming it seems to work just
fine. Would it be worth adding a (guarded) resume call in the cleanup too?

 PVE/QemuMigrate.pm                    | 23 ++++++++----------
 test/MigrationTest/QemuMigrateMock.pm |  6 +++++
 test/run_qemu_migrate_tests.pl        | 35 +++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/PVE/QemuMigrate.pm b/PVE/QemuMigrate.pm
index b503601..435c1f7 100644
--- a/PVE/QemuMigrate.pm
+++ b/PVE/QemuMigrate.pm
@@ -1134,6 +1134,16 @@ sub phase2 {
 	    die "unable to parse migration status '$stat->{status}' - aborting\n";
 	}
     }
+
+    if ($self->{storage_migration}) {
+	# finish block-job with block-job-cancel, to disconnect source VM from NBD
+	# to avoid it trying to re-establish it. We are in blockjob ready state,
+	# thus, this command changes to it to blockjob complete (see qapi docs)
+	eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
+	if (my $err = $@) {
+	    die "Failed to complete storage migration: $err\n";
+	}
+    }
 }
 
 sub phase2_cleanup {
@@ -1209,19 +1219,6 @@ sub phase3_cleanup {
 
     my $tunnel = $self->{tunnel};
 
-    if ($self->{storage_migration}) {
-	# finish block-job with block-job-cancel, to disconnect source VM from NBD
-	# to avoid it trying to re-establish it. We are in blockjob ready state,
-	# thus, this command changes to it to blockjob complete (see qapi docs)
-	eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
-
-	if (my $err = $@) {
-	    eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $self->{storage_migration_jobs}) };
-	    eval { PVE::QemuMigrate::cleanup_remotedisks($self) };
-	    die "Failed to complete storage migration: $err\n";
-	}
-    }
-
     if ($self->{volume_map}) {
 	my $target_drives = $self->{target_drive};
 
diff --git a/test/MigrationTest/QemuMigrateMock.pm b/test/MigrationTest/QemuMigrateMock.pm
index 2d424e0..8e0b7d0 100644
--- a/test/MigrationTest/QemuMigrateMock.pm
+++ b/test/MigrationTest/QemuMigrateMock.pm
@@ -139,6 +139,12 @@ $MigrationTest::Shared::qemu_server_module->mock(
 	file_set_contents("${RUN_DIR_PATH}/nbd_info", to_json($nbd_info));
     },
     qemu_drive_mirror_monitor => sub {
+	my ($vmid, $vmiddst, $jobs, $completion, $qga) = @_;
+
+	if ($fail_config->{qemu_drive_mirror_monitor} &&
+	    $fail_config->{qemu_drive_mirror_monitor} eq $completion) {
+	    die "qemu_drive_mirror_monitor '$completion' error\n";
+	}
 	return;
     },
     set_migration_caps => sub {
diff --git a/test/run_qemu_migrate_tests.pl b/test/run_qemu_migrate_tests.pl
index 4f7f021..5edea7b 100755
--- a/test/run_qemu_migrate_tests.pl
+++ b/test/run_qemu_migrate_tests.pl
@@ -1444,6 +1444,41 @@ my $tests = [
 	    },
 	},
     },
+    {
+	name => '149_running_unused_block_job_cancel_fail',
+	target => 'pve1',
+	vmid => 149,
+	vm_status => {
+	    running => 1,
+	    runningmachine => 'pc-q35-5.0+pve0',
+	},
+	opts => {
+	    online => 1,
+	    'with-local-disks' => 1,
+	},
+	config_patch => {
+	    scsi1 => undef,
+	    unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+	},
+	expected_calls => {},
+	expect_die => "qemu_drive_mirror_monitor 'cancel' error",
+	# note that 'cancel' is also used to finish and that's what this test is about
+	fail_config => {
+	    'qemu_drive_mirror_monitor' => 'cancel',
+	},
+	expected => {
+	    source_volids => local_volids_for_vm(149),
+	    target_volids => {},
+	    vm_config => get_patched_config(149, {
+		scsi1 => undef,
+		unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+	    }),
+	    vm_status => {
+		running => 1,
+		runningmachine => 'pc-q35-5.0+pve0',
+	    },
+	},
+    },
     {
 	name => '149_offline',
 	target => 'pve1',
-- 
2.20.1