[pve-devel] [PATCH v2 qemu-server 3/3] migrate: add live-migration of replicated disks

Fabian Grünbichler f.gruenbichler at proxmox.com
Tue Mar 17 08:55:27 CET 2020


with incremental drive-mirror and dirty-bitmap tracking.

1.) get replicated disks that are currently referenced by the running VM
2.) add a block-dirty-bitmap to each of them
3.) replicate ALL replicated disks
4.) pass bitmaps from 2) to drive-mirror for disks from 1)
5.) skip replicated disks when cleaning up volumes on either source or
target

The added error handling just removes the bitmaps if an error occurs at
any point after 2), except when the handover to the target node has
already happened, since in that case the bitmaps are cleaned up together
with the source VM.
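
For illustration, the per-disk sequence for steps 2) and 4) boils down to
roughly the following (VM id and drive name are made-up example values;
mon_cmd is the helper the patch itself uses, and the bitmap argument of
qemu_drive_mirror was added earlier in this series):

    use PVE::QemuServer::Monitor qw(mon_cmd);

    my ($vmid, $ds) = (100, 'scsi0');    # hypothetical example values
    my $bitmap = "repl_$ds";

    # 2) start dirty tracking on the attached drive before replication
    mon_cmd($vmid, 'block-dirty-bitmap-add',
        node => "drive-$ds", name => $bitmap);

    # 3) run_replication() then transfers consistent snapshots of all
    # replicated volumes to the target node

    # 4) phase 2 passes $bitmap to qemu_drive_mirror(), turning the full
    # sync over NBD into an incremental one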

Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
---

Notes:
    v1->v2:
    - rebased and fixed some conflicts

    for Qemu 4.2 you probably want the "'block-job-cancel' instead of
    'block-job-complete'" change recently discussed in response to Mira's UNIX
    migration series as well, otherwise shutting down the source VM will
    sometimes hang / need a forceful termination when using NBD.
    
    tested with single and multiple replicated disks, with and without (other):
    - local, replicated, unreferenced disks
    - local, non-replicated, referenced disks
    - local, non-replicated, unreferenced disks
    - shared disks
    
    and with switching targetstorage (obviously the replicated disks don't switch
    storage in that case, but the others do as before ;))
    
    I tested writes from within the VM during both 'hot' phases:
    - between adding the bitmap and replication
    - between replication and drive-mirror
    
    one thing to note is that since we add the bitmaps before taking the
    replication snapshots, they will contain some writes that were already
    part of the last replication. AFAICT this should not be a problem, but we
    could also play it safe and do a
    
    - freeze
    - add bitmaps
    - start replication (which will unfreeze after taking the snapshots)
    
    sequence?
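
    that alternative would look roughly like this (untested sketch; assumes a
    running guest agent and uses the same helpers as the patch):

        mon_cmd($vmid, 'guest-fsfreeze-freeze');   # needs the QGA
        foreach my $ds (keys %$live_replicatable_volumes) {
            mon_cmd($vmid, 'block-dirty-bitmap-add',
                node => "drive-$ds", name => "repl_$ds");
        }
        # the replication snapshots (and the thaw) happen here, so the
        # bitmaps would only contain writes made after the snapshots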
    
    the bitmap info is stored in the same hash as the other live local disk
    migration state, even though it comes from a different source and at a
    different time. I initially had it in a separate hash in $self->{bitmaps};
    both variants are equally inelegant IMHO :P
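
    for reference, the entry shape this results in (field values made up):

        # keyed by drive name, one entry per live-mirrored local disk
        $self->{target_drive}->{scsi0} = {
            # set in sync_disks() before replication runs (this patch)
            bitmap   => 'repl_scsi0',
            # set in phase 2 from the target node's output, as before
            drivestr => 'local-lvm:vm-100-disk-0,format=raw,size=32G',
        };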

 PVE/QemuMigrate.pm | 52 ++++++++++++++++++++++++++++++++++++++++++++--
 PVE/QemuServer.pm  |  7 +++++++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/PVE/QemuMigrate.pm b/PVE/QemuMigrate.pm
index d874760..7ae8246 100644
--- a/PVE/QemuMigrate.pm
+++ b/PVE/QemuMigrate.pm
@@ -444,8 +444,30 @@ sub sync_disks {
 
 	my $rep_cfg = PVE::ReplicationConfig->new();
 	if (my $jobcfg = $rep_cfg->find_local_replication_job($vmid, $self->{node})) {
-	    die "can't live migrate VM with replicated volumes\n" if $self->{running};
+	    if ($self->{running}) {
+		my $live_replicatable_volumes = {};
+		PVE::QemuServer::foreach_drive($conf, sub {
+		    my ($ds, $drive) = @_;
+
+		    my $volid = $drive->{file};
+		    $live_replicatable_volumes->{$ds} = $volid
+			if defined($replicatable_volumes->{$volid});
+		});
+		foreach my $drive (keys %$live_replicatable_volumes) {
+		    my $volid = $live_replicatable_volumes->{$drive};
+
+		    my $bitmap = "repl_$drive";
+
+		    # start tracking before replication to get full delta + a few duplicates
+		    $self->log('info', "$drive: start tracking writes using block-dirty-bitmap '$bitmap'");
+		    mon_cmd($vmid, 'block-dirty-bitmap-add', node => "drive-$drive", name => $bitmap);
+
+		    # other info comes from target node in phase 2
+		    $self->{target_drive}->{$drive}->{bitmap} = $bitmap;
+		}
+	    }
 	    $self->log('info', "replicating disk images");
+
 	    my $start_time = time();
 	    my $logfunc = sub { $self->log('info', shift) };
 	    $self->{replicated_volumes} = PVE::Replication::run_replication(
@@ -500,6 +522,8 @@ sub cleanup_remotedisks {
     my ($self) = @_;
 
     foreach my $target_drive (keys %{$self->{target_drive}}) {
+	# don't clean up replicated disks!
+	next if defined($self->{target_drive}->{$target_drive}->{bitmap});
 
 	my $drive = PVE::QemuServer::parse_drive($target_drive, $self->{target_drive}->{$target_drive}->{drivestr});
 	my ($storeid, $volname) = PVE::Storage::parse_volume_id($drive->{file});
@@ -514,6 +538,16 @@ sub cleanup_remotedisks {
     }
 }
 
+sub cleanup_bitmaps {
+    my ($self) = @_;
+    foreach my $drive (keys %{$self->{target_drive}}) {
+	my $bitmap = $self->{target_drive}->{$drive}->{bitmap};
+	next if !$bitmap;
+	$self->log('info', "$drive: removing block-dirty-bitmap '$bitmap'");
+	mon_cmd($self->{vmid}, 'block-dirty-bitmap-remove', node => "drive-$drive", name => $bitmap);
+    }
+}
+
 sub phase1 {
     my ($self, $vmid) = @_;
 
@@ -550,6 +584,12 @@ sub phase1_cleanup {
 	    # fixme: try to remove ?
 	}
     }
+
+    eval { $self->cleanup_bitmaps() };
+    if (my $err = $@) {
+	$self->log('err', $err);
+    }
+
 }
 
 sub phase2 {
@@ -704,9 +744,10 @@ sub phase2 {
 	    my $target_sid = PVE::Storage::Plugin::parse_volume_id($target_drive->{file});
 
 	    my $bwlimit = PVE::Storage::get_bandwidth_limit('migration', [$source_sid, $target_sid], $opt_bwlimit);
+	    my $bitmap = $target->{bitmap};
 
 	    $self->log('info', "$drive: start migration to $nbd_uri");
-	    PVE::QemuServer::qemu_drive_mirror($vmid, $drive, $nbd_uri, $vmid, undef, $self->{storage_migration_jobs}, 1, undef, $bwlimit);
+	    PVE::QemuServer::qemu_drive_mirror($vmid, $drive, $nbd_uri, $vmid, undef, $self->{storage_migration_jobs}, 1, undef, $bwlimit, $bitmap);
 	}
     }
 
@@ -924,6 +965,10 @@ sub phase2_cleanup {
 	if (my $err = $@) {
 	    $self->log('err', $err);
 	}
+	eval { $self->cleanup_bitmaps() };
+	if (my $err = $@) {
+	    $self->log('err', $err);
+	}
     }
 
     my $nodename = PVE::INotify::nodename();
@@ -1083,6 +1128,9 @@ sub phase3_cleanup {
 	my $volids = $self->{online_local_volumes};
 
 	foreach my $volid (@$volids) {
+	    # keep replicated volumes!
+	    next if $self->{replicated_volumes}->{$volid};
+
 	    eval { PVE::Storage::vdisk_free($self->{storecfg}, $volid); };
 	    if (my $err = $@) {
 		$self->log('err', "removing local copy of '$volid' failed - $err");
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 83033df..5b449b7 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -4756,11 +4756,18 @@ sub vm_start {
 		$local_volumes->{$ds} = [$volid, $storeid, $volname];
 	    });
 
+	    my $replicatable_volumes = PVE::QemuConfig->get_replicatable_volumes($storecfg, $vmid, $conf);
+
 	    my $format = undef;
 
 	    foreach my $opt (sort keys %$local_volumes) {
 
 		my ($volid, $storeid, $volname) = @{$local_volumes->{$opt}};
+		if ($replicatable_volumes->{$volid}) {
+		    # re-use existing, replicated volume with bitmap on source side
+		    $local_volumes->{$opt} = $conf->{$opt};
+		    next;
+		}
 		my $drive = parse_drive($opt, $conf->{$opt});
 
 		# If a remote storage is specified and the format of the original
-- 
2.20.1




