[pve-devel] [PATCH 2/6] qemu_drive_mirror : handle multiple jobs

Alexandre DERUMIER aderumier at odiso.com
Thu Nov 10 13:54:22 CET 2016


> + die "can't connect remote nbd server $server:$port" if !PVE::Network::tcp_ping($server, $port, 2); 
>>I'm not all too happy about this check here as it is a TOCTOU race. 
>>(I'm not happy about some of the other uses of it as well, but some are 
>>only for status queries, in other words, to display information, where it's fine.) 

>>However, in this case, if broken/missing connections still cannot be 
>>caught (like in my previous tests), then this only hides 99.99% of the 
>>cases while still wrongly deleting disks in the other 0.01%, which is 
>>unacceptable behavior. 

So, do you want me to remove the check?
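For reference, a minimal Python sketch (hypothetical names, not PVE code) of why a reachability pre-check like tcp_ping races with the actual use of the connection:

```python
import socket

def tcp_ping(host, port, timeout=2):
    # Pre-flight reachability probe, analogous to PVE::Network::tcp_ping.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

def start_mirror_to_nbd(host, port, start_job):
    # Racy pattern: the server can go away between the probe and the moment
    # the mirror actually connects, so the probe only narrows the window
    # instead of closing it (the TOCTOU race the review points out).
    if not tcp_ping(host, port):
        raise RuntimeError(f"can't connect remote nbd server {host}:{port}")
    # ... the probe result is already stale by the time we get here ...
    start_job(host, port)  # the robust check is handling *this* call's error
```

The robust alternative is to skip the probe entirely, attempt the real operation (here, the drive-mirror QMP command), and handle its failure, which is what makes the error path's behavior so important.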




>>$jobs is still empty at this point. The assignment below should be moved 
>>up. 

> + die "mirroring error: $err"; 
> + } 
> + 
> + $jobs->{"drive-$drive"} = {}; 
>>This one ^ 

Ok.
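To make the ordering fix concrete, here is a Python sketch of the corrected flow (run_qmp and cancel_all are hypothetical stand-ins for vm_mon_cmd and qemu_blockjobs_cancel):

```python
def qemu_drive_mirror_sketch(vmid, drive, jobs, run_qmp, cancel_all):
    # Register the job *before* starting it, so the error path that
    # cancels everything in `jobs` also covers this device.
    jobs[f"drive-{drive}"] = {}
    try:
        run_qmp(vmid, "drive-mirror", device=f"drive-{drive}")
    except Exception as err:
        # cancel_all now sees drive-<drive> in addition to any sibling
        # jobs started by earlier calls with the same `jobs` hash.
        try:
            cancel_all(vmid, jobs)
        except Exception:
            pass
        raise RuntimeError(f"mirroring error: {err}")
```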

Thanks for the review!



----- Original Message -----
From: "Wolfgang Bumiller" <w.bumiller at proxmox.com>
To: "aderumier" <aderumier at odiso.com>
Cc: "pve-devel" <pve-devel at pve.proxmox.com>
Sent: Thursday, November 10, 2016 13:21:00
Subject: Re: [pve-devel] [PATCH 2/6] qemu_drive_mirror : handle multiple jobs

On Tue, Nov 08, 2016 at 04:29:30AM +0100, Alexandre Derumier wrote: 
> we can use multiple drive_mirror in parallel. 
> 
> block-job-complete can be skipped, if we want to add more mirror jobs later. 
> 
> also add support for nbd uri to qemu_drive_mirror 
> 
> Signed-off-by: Alexandre Derumier <aderumier at odiso.com> 
> --- 
> PVE/QemuServer.pm | 171 +++++++++++++++++++++++++++++++++++++++--------------- 
> 1 file changed, 123 insertions(+), 48 deletions(-) 
> 
> diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm 
> index 54edd96..e989670 100644 
> --- a/PVE/QemuServer.pm 
> +++ b/PVE/QemuServer.pm 
> @@ -5824,91 +5824,165 @@ sub qemu_img_format { 
> } 
> 
> sub qemu_drive_mirror { 
> - my ($vmid, $drive, $dst_volid, $vmiddst, $is_zero_initialized) = @_; 
> + my ($vmid, $drive, $dst_volid, $vmiddst, $is_zero_initialized, $jobs, $skipcomplete) = @_; 
> 
> - my $storecfg = PVE::Storage::config(); 
> - my ($dst_storeid, $dst_volname) = PVE::Storage::parse_volume_id($dst_volid); 
> + $jobs = {} if !$jobs; 
> + 
> + my $qemu_target; 
> + my $format; 
> 
> - my $dst_scfg = PVE::Storage::storage_config($storecfg, $dst_storeid); 
> + if($dst_volid =~ /^nbd:(localhost|[\d\.]+|\[[\d\.:a-fA-F]+\]):(\d+)/) { 
> + $qemu_target = $dst_volid; 
> + my $server = $1; 
> + my $port = $2; 
> + $format = "nbd"; 
> + die "can't connect remote nbd server $server:$port" if !PVE::Network::tcp_ping($server, $port, 2); 

I'm not all too happy about this check here as it is a TOCTOU race. 
(I'm not happy about some of the other uses of it as well, but some are 
only for status queries, in other words, to display information, where it's fine.) 

However, in this case, if broken/missing connections still cannot be 
caught (like in my previous tests), then this only hides 99.99% of the 
cases while still wrongly deleting disks in the other 0.01%, which is 
unacceptable behavior. 

> + } else { 
> + 
> + my $storecfg = PVE::Storage::config(); 
> + my ($dst_storeid, $dst_volname) = PVE::Storage::parse_volume_id($dst_volid); 
> + 
> + my $dst_scfg = PVE::Storage::storage_config($storecfg, $dst_storeid); 
> 
> - my $format = qemu_img_format($dst_scfg, $dst_volname); 
> + $format = qemu_img_format($dst_scfg, $dst_volname); 
> 
> - my $dst_path = PVE::Storage::path($storecfg, $dst_volid); 
> + my $dst_path = PVE::Storage::path($storecfg, $dst_volid); 
> 
> - my $qemu_target = $is_zero_initialized ? "zeroinit:$dst_path" : $dst_path; 
> + $qemu_target = $is_zero_initialized ? "zeroinit:$dst_path" : $dst_path; 
> + } 
> 
> my $opts = { timeout => 10, device => "drive-$drive", mode => "existing", sync => "full", target => $qemu_target }; 
> $opts->{format} = $format if $format; 
> 
> - print "drive mirror is starting (scanning bitmap) : this step can take some minutes/hours, depend of disk size and storage speed\n"; 
> + print "drive mirror is starting for drive-$drive\n"; 
> 
> - my $finish_job = sub { 
> - while (1) { 
> - my $stats = vm_mon_cmd($vmid, "query-block-jobs"); 
> - my $stat = @$stats[0]; 
> - last if !$stat; 
> - sleep 1; 
> - } 
> - }; 
> + eval { vm_mon_cmd($vmid, "drive-mirror", %$opts); }; #if a job already run for this device,it's throw an error 
> + if (my $err = $@) { 
> + eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $jobs) }; 

$jobs is still empty at this point. The assignment below should be moved 
up. 

> + die "mirroring error: $err"; 
> + } 
> + 
> + $jobs->{"drive-$drive"} = {}; 
This one ^ 

> + 
> + qemu_drive_mirror_monitor ($vmid, $vmiddst, $jobs, $skipcomplete); 
> +} 
> + 
> +sub qemu_drive_mirror_monitor { 
> + my ($vmid, $vmiddst, $jobs, $skipcomplete) = @_; 
> 
> eval { 
> - vm_mon_cmd($vmid, "drive-mirror", %$opts); 
> + 
> + my $err_complete = 0; 
> + 
> while (1) { 
> + die "storage migration timed out\n" if $err_complete > 300; 
> + 
> my $stats = vm_mon_cmd($vmid, "query-block-jobs"); 
> - my $stat = @$stats[0]; 
> - die "mirroring job seem to have die. Maybe do you have bad sectors?" if !$stat; 
> - die "error job is not mirroring" if $stat->{type} ne "mirror"; 
> 
> - my $busy = $stat->{busy}; 
> - my $ready = $stat->{ready}; 
> + my $running_mirror_jobs = {}; 
> + foreach my $stat (@$stats) { 
> + next if $stat->{type} ne 'mirror'; 
> + $running_mirror_jobs->{$stat->{device}} = $stat; 
> + } 
> 
> - if (my $total = $stat->{len}) { 
> - my $transferred = $stat->{offset} || 0; 
> - my $remaining = $total - $transferred; 
> - my $percent = sprintf "%.2f", ($transferred * 100 / $total); 
> + my $readycounter = 0; 
> 
> - print "transferred: $transferred bytes remaining: $remaining bytes total: $total bytes progression: $percent % busy: $busy ready: $ready \n"; 
> - } 
> + foreach my $job (keys %$jobs) { 
> + 
> + if(defined($jobs->{$job}->{complete}) && !defined($running_mirror_jobs->{$job})) { 
> + print "$job : finished\n"; 
> + delete $jobs->{$job}; 
> + next; 
> + } 
> + 
> + die "$job: mirroring has been cancelled. Maybe do you have bad sectors?" if !defined($running_mirror_jobs->{$job}); 
> 
> + my $busy = $running_mirror_jobs->{$job}->{busy}; 
> + my $ready = $running_mirror_jobs->{$job}->{ready}; 
> + if (my $total = $running_mirror_jobs->{$job}->{len}) { 
> + my $transferred = $running_mirror_jobs->{$job}->{offset} || 0; 
> + my $remaining = $total - $transferred; 
> + my $percent = sprintf "%.2f", ($transferred * 100 / $total); 
> 
> - if ($stat->{ready} eq 'true') { 
> + print "$job: transferred: $transferred bytes remaining: $remaining bytes total: $total bytes progression: $percent % busy: $busy ready: $ready \n"; 
> + } 
> 
> - last if $vmiddst != $vmid; 
> + $readycounter++ if $running_mirror_jobs->{$job}->{ready} eq 'true'; 
> + } 
> 
> - # try to switch the disk if source and destination are on the same guest 
> - eval { vm_mon_cmd($vmid, "block-job-complete", device => "drive-$drive") }; 
> - if (!$@) { 
> - &$finish_job(); 
> + last if scalar(keys %$jobs) == 0; 
> + 
> + if ($readycounter == scalar(keys %$jobs)) { 
> + print "all mirroring jobs are ready \n"; 
> + last if $skipcomplete; #do the complete later 
> + 
> + if ($vmiddst && $vmiddst != $vmid) { 
> + # if we clone a disk for a new target vm, we don't switch the disk 
> + PVE::QemuServer::qemu_blockjobs_cancel($vmid, $jobs); 
> last; 
> + } else { 
> + 
> + foreach my $job (keys %$jobs) { 
> + # try to switch the disk if source and destination are on the same guest 
> + print "$job : Try to complete block job\n"; 
> + 
> + eval { vm_mon_cmd($vmid, "block-job-complete", device => $job) }; 
> + if ($@ =~ m/cannot be completed/) { 
> + print "$job : block job cannot be complete. Try again \n"; 
> + $err_complete++; 
> + }else { 
> + print "$job : complete ok : flushing pending writes\n"; 
> + $jobs->{$job}->{complete} = 1; 
> + } 
> + } 
> } 
> - die $@ if $@ !~ m/cannot be completed/; 
> } 
> sleep 1; 
> } 
> - 
> - 
> }; 
> my $err = $@; 
> 
> - my $cancel_job = sub { 
> - vm_mon_cmd($vmid, "block-job-cancel", device => "drive-$drive"); 
> - &$finish_job(); 
> - }; 
> - 
> if ($err) { 
> - eval { &$cancel_job(); }; 
> + eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $jobs) }; 
> die "mirroring error: $err"; 
> } 
> 
> - if ($vmiddst != $vmid) { 
> - # if we clone a disk for a new target vm, we don't switch the disk 
> - &$cancel_job(); # so we call block-job-cancel 
> +} 
> + 
> +sub qemu_blockjobs_cancel { 
> + my ($vmid, $jobs) = @_; 
> + 
> + foreach my $job (keys %$jobs) { 
> + print "$job: try to cancel block job\n"; 
> + eval { vm_mon_cmd($vmid, "block-job-cancel", device => $job); }; 
> + $jobs->{$job}->{cancel} = 1; 
> + } 
> + 
> + while (1) { 
> + my $stats = vm_mon_cmd($vmid, "query-block-jobs"); 
> + 
> + my $running_jobs = {}; 
> + foreach my $stat (@$stats) { 
> + $running_jobs->{$stat->{device}} = $stat; 
> + } 
> + 
> + foreach my $job (keys %$jobs) { 
> + 
> + if(defined($jobs->{$job}->{cancel}) && !defined($running_jobs->{$job})) { 
> + print "$job : finished\n"; 
> + delete $jobs->{$job}; 
> + } 
> + } 
> + 
> + last if scalar(keys %$jobs) == 0; 
> + 
> + sleep 1; 
> } 
> } 
> 
> sub clone_disk { 
> my ($storecfg, $vmid, $running, $drivename, $drive, $snapname, 
> - $newvmid, $storage, $format, $full, $newvollist) = @_; 
> + $newvmid, $storage, $format, $full, $newvollist, $jobs, $skipcomplete) = @_; 
> 
> my $newvolid; 
> 
> @@ -5917,6 +5991,7 @@ sub clone_disk { 
> $newvolid = PVE::Storage::vdisk_clone($storecfg, $drive->{file}, $newvmid, $snapname); 
> push @$newvollist, $newvolid; 
> } else { 
> + 
> my ($storeid, $volname) = PVE::Storage::parse_volume_id($drive->{file}); 
> $storeid = $storage if $storage; 
> 
> @@ -5949,7 +6024,7 @@ sub clone_disk { 
> if $drive->{iothread}; 
> } 
> 
> - qemu_drive_mirror($vmid, $drivename, $newvolid, $newvmid, $sparseinit); 
> + qemu_drive_mirror($vmid, $drivename, $newvolid, $newvmid, $sparseinit, $jobs, $skipcomplete); 
> } 
> } 
> 
> -- 
> 2.1.4 
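For readers following the patch, the polling loop that qemu_drive_mirror_monitor implements can be condensed as follows (Python sketch; query_jobs and complete are hypothetical stand-ins for the query-block-jobs and block-job-complete QMP calls):

```python
import time

def mirror_monitor(jobs, query_jobs, complete, max_err=300, poll=1):
    # Poll until every tracked mirror job has completed and disappeared.
    # query_jobs() returns {device: {"ready": bool, ...}} for running
    # mirror jobs; complete(device) returns True once the switch succeeds.
    err_complete = 0
    while jobs:
        if err_complete > max_err:
            raise TimeoutError("storage migration timed out")
        running = query_jobs()
        for dev in list(jobs):
            if jobs[dev].get("complete") and dev not in running:
                del jobs[dev]  # completed job is gone from QEMU: finished
            elif dev not in running:
                raise RuntimeError(f"{dev}: mirroring has been cancelled")
        # Only switch disks once *all* remaining jobs report ready.
        if jobs and all(running[d]["ready"] for d in jobs):
            for dev in jobs:
                if complete(dev):
                    jobs[dev]["complete"] = 1
                else:
                    err_complete += 1  # not yet completable, retry next pass
        time.sleep(poll)
```

This mirrors the structure of the Perl loop above: jobs are only removed from the tracking hash once QEMU no longer reports them, and "cannot be completed" errors are retried up to a timeout.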



