[pve-devel] [PATCH pve-manager] replication: improve stale volume detection, allow sync from parent snapshot
Dietmar Maurer
dietmar at proxmox.com
Wed Jun 7 10:27:55 CEST 2017
We pass a list of storages to scan for stale volumes to prepare_local_job().
That way we make sure that we only activate/scan related storages.
Snapshot rollback may remove local replication snapshots. In that case
we still have the $conf->{parent} snapshot on both sides, so we
can use that as the base snapshot.
Signed-off-by: Dietmar Maurer <dietmar at proxmox.com>
---
PVE/CLI/pvesr.pm | 148 +++++++++++++++++++++++++----------------
PVE/Replication.pm | 62 +++++++++++------
bin/test/replication_test5.log | 14 ++--
bin/test/replication_test5.pl | 4 +-
4 files changed, 138 insertions(+), 90 deletions(-)
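
For review, a condensed sketch of the base-snapshot selection implemented
in replicate() below. The nested volid => { snapname => 1 } layout matches
the patch; the helper name pick_base_snapshot is only for illustration and
not part of the patch:

    # Illustrative sketch only.
    # $local/$remote map: volid => { snapname => 1 }
    sub pick_base_snapshot {
	my ($local, $remote, $volid, $last_sync_snapname, $parent_snapname) = @_;

	my $l = $local->{$volid};
	my $r = $remote->{$volid};
	return undef if !defined($l) || !defined($r); # no common snapshot => full sync

	# prefer the replication snapshot from the last successful sync
	return $last_sync_snapname
	    if $l->{$last_sync_snapname} && $r->{$last_sync_snapname};

	# else fall back to the guest's parent snapshot, which survives
	# a snapshot rollback that removed the replication snapshots
	return $parent_snapname
	    if defined($parent_snapname) && $l->{$parent_snapname} && $r->{$parent_snapname};

	return undef; # caller does a full sync
    }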
diff --git a/PVE/CLI/pvesr.pm b/PVE/CLI/pvesr.pm
index dc367638..4c76b416 100644
--- a/PVE/CLI/pvesr.pm
+++ b/PVE/CLI/pvesr.pm
@@ -25,6 +25,28 @@ sub setup_environment {
PVE::RPCEnvironment->setup_default_cli_env();
}
+# fixme: get from plugin??
+my $replicatable_storage_types = {
+ zfspool => 1,
+};
+
+my $check_wanted_volid = sub {
+ my ($storecfg, $vmid, $volid, $local_node) = @_;
+
+ my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
+ my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
+ die "storage '$storeid' is not replicatable\n"
+ if !$replicatable_storage_types->{$scfg->{type}};
+
+ my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
+ die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
+ if $vtype ne 'images';
+ die "volume '$volid' has wrong owner\n"
+ if !$ownervm || $vmid != $ownervm;
+
+ return $storeid;
+};
+
__PACKAGE__->register_method ({
name => 'prepare_local_job',
path => 'prepare_local_job',
@@ -36,6 +58,11 @@ __PACKAGE__->register_method ({
id => get_standard_option('pve-replication-id'),
'extra-args' => get_standard_option('extra-args', {
description => "The list of volume IDs to consider." }),
+ scan => {
+ description => "List of storage IDs to scan for stale volumes.",
+ type => 'string', format => 'pve-storage-id-list',
+ optional => 1,
+ },
force => {
description => "Allow to remove all existing volumes (empty volume list).",
type => 'boolean',
@@ -48,75 +75,88 @@ __PACKAGE__->register_method ({
minimum => 0,
optional => 1,
},
+ parent_snapname => get_standard_option('pve-snapshot-name', {
+ optional => 1,
+ }),
},
},
returns => { type => 'null' },
code => sub {
my ($param) = @_;
- my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
- my $last_sync = $param->{last_sync} // 0;
+ my $logfunc = sub {
+ my ($msg) = @_;
+ print STDERR "$msg\n";
+ };
my $local_node = PVE::INotify::nodename();
+ die "no volumes specified\n"
+ if !$param->{force} && !scalar(@{$param->{'extra-args'}});
+
+ my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
+
my $vms = PVE::Cluster::get_vmlist();
die "guest '$vmid' is on local node\n"
if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
- my $storecfg = PVE::Storage::config();
-
- my $dl = PVE::Storage::vdisk_list($storecfg, undef, $vmid);
+ my $last_sync = $param->{last_sync} // 0;
+ my $parent_snapname = $param->{parent_snapname};
- my $volids = [];
+ my $storecfg = PVE::Storage::config();
- die "no volumes specified\n"
- if !$param->{force} && !scalar(@{$param->{'extra-args'}});
+ # compute list of storages we want to scan
+ my $storage_hash = {};
+ foreach my $storeid (PVE::Tools::split_list($param->{scan})) {
+ my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
+ next if !$scfg; # simply ignore unavailable storages here
+ die "storage '$storeid' is not replicatable\n" if !$replicatable_storage_types->{$scfg->{type}};
+ $storage_hash->{$storeid} = 1;
+ }
+ my $wanted_volids = {};
foreach my $volid (@{$param->{'extra-args'}}) {
-
- my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
- my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
- die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
-
- my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
- die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
- if $vtype ne 'images';
- die "volume '$volid' has wrong owner\n"
- if !$ownervm || $vmid != $ownervm;
-
- my $found = 0;
- foreach my $info (@{$dl->{$storeid}}) {
- if ($info->{volid} eq $volid) {
- $found = 1;
- last;
- }
- }
-
- push @$volids, $volid if $found;
+ my $storeid = $check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
+ $wanted_volids->{$volid} = 1;
+ $storage_hash->{$storeid} = 1;
}
-
- $volids = [ sort @$volids ];
-
- my $logfunc = sub {
- my ($msg) = @_;
- print STDERR "$msg\n";
- };
-
- # remove stale volumes
- foreach my $storeid (keys %$dl) {
- my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
- next if !$scfg || $scfg->{shared};
- foreach my $info (@{$dl->{$storeid}}) {
- my $volid = $info->{volid};
- next if grep { $_ eq $volid } @$volids;
- $logfunc->("$jobid: delete stale volume '$volid'");
- PVE::Storage::vdisk_free($storecfg, $volid);
+ my $storage_list = [ sort keys %$storage_hash ];
+
+ # activate all used storage
+ my $cache = {};
+ PVE::Storage::activate_storage_list($storecfg, $storage_list, $cache);
+
+ my $snapname = PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
+
+ # find replication snapshots
+ my $last_snapshots = {};
+ foreach my $storeid (@$storage_list) {
+ my $scfg = PVE::Storage::storage_config($storecfg, $storeid);
+ my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
+ my $volids = $plugin->list_images($storeid, $scfg, $vmid, undef, $cache);
+ foreach my $volid (@$volids) {
+ my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
+ my $list = $plugin->volume_snapshot_list($scfg, $storeid, $volname); # fixme: pass $cache
+ my $found_replication_snapshots = 0;
+ foreach my $snap (@$list) {
+ if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
+ $last_snapshots->{$volid}->{$snap} = 1 if $wanted_volids->{$volid};
+ } elsif ($snap =~ m/^__replicate_/) {
+ $found_replication_snapshots = 1;
+ if ($wanted_volids->{$volid}) {
+ $logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
+ PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
+ }
+ }
+ }
+ # remove stale volumes
+ if ($found_replication_snapshots && !$wanted_volids->{$volid}) {
+ $logfunc->("$jobid: delete stale volume '$volid'");
+ PVE::Storage::vdisk_free($storecfg, $volid);
+ }
}
}
- my $last_snapshots = PVE::Replication::prepare(
- $storecfg, $volids, $jobid, $last_sync, undef, $logfunc);
-
print to_json($last_snapshots) . "\n";
return undef;
@@ -161,17 +201,7 @@ __PACKAGE__->register_method ({
die "no volumes specified\n" if !scalar(@{$param->{'extra-args'}});
foreach my $volid (@{$param->{'extra-args'}}) {
-
- my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
- my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
- die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
-
- my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
- die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
- if $vtype ne 'images';
- die "volume '$volid' has wrong owner\n"
- if !$ownervm || $vmid != $ownervm;
-
+ $check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
push @$volids, $volid;
}
diff --git a/PVE/Replication.pm b/PVE/Replication.pm
index a4013fb4..eb2583dc 100644
--- a/PVE/Replication.pm
+++ b/PVE/Replication.pm
@@ -113,13 +113,15 @@ my $get_next_job = sub {
};
sub remote_prepare_local_job {
- my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
+ my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
my $ssh_cmd = PVE::Cluster::ssh_info_to_command($ssh_info);
my $cmd = [@$ssh_cmd, '--', 'pvesr', 'prepare-local-job', $jobid, $vmid];
+ push @$cmd, '--scan', join(',', @$storeid_list) if scalar(@$storeid_list);
push @$cmd, @$volumes if scalar(@$volumes);
push @$cmd, '--last_sync', $last_sync;
+ push @$cmd, '--parent_snapname', $parent_snapname if $parent_snapname;
push @$cmd, '--force' if $force;
my $remote_snapshots;
@@ -147,21 +149,25 @@ sub remote_finalize_local_job {
PVE::Tools::run_command($cmd);
}
+# finds local replication snapshots from $last_sync
+# and removes all replication snapshots with other time stamps
sub prepare {
- my ($storecfg, $volids, $jobid, $last_sync, $start_time, $logfunc) = @_;
+ my ($storecfg, $volids, $jobid, $last_sync, $parent_snapname, $logfunc) = @_;
+
+ $last_sync //= 0;
my ($prefix, $snapname) =
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
my $last_snapshots = {};
foreach my $volid (@$volids) {
- my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid, $prefix);
+ my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid);
my $found = 0;
foreach my $snap (@$list) {
- if ($snap eq $snapname) {
- $last_snapshots->{$volid} = 1;
- } else {
- $logfunc->("$jobid: delete stale snapshot '$snap' on $volid");
+ if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
+ $last_snapshots->{$volid}->{$snap} = 1;
+ } elsif ($snap =~ m/^\Q$prefix\E/) {
+ $logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
}
}
@@ -251,11 +257,11 @@ sub replicate {
if ($remove_job eq 'full' && $jobcfg->{target} ne $local_node) {
# remove all remote volumes
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target});
- remote_prepare_local_job($ssh_info, $jobid, $vmid, [], 0, 1);
+ remote_prepare_local_job($ssh_info, $jobid, $vmid, [], $state->{storeid_list}, 0, undef, 1);
}
# remove all local replication snapshots (lastsync => 0)
- prepare($storecfg, $sorted_volids, $jobid, 0, $start_time, $logfunc);
+ prepare($storecfg, $sorted_volids, $jobid, 0, undef, $logfunc);
delete_job($jobid); # update config
$logfunc->("$jobid: job removed");
@@ -265,19 +271,22 @@ sub replicate {
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target}, $migration_network);
- # prepare remote side
- my $remote_snapshots = remote_prepare_local_job(
- $ssh_info, $jobid, $vmid, $sorted_volids, $last_sync);
-
- # test if we have a replication_ snapshot from last sync
- # and remove all other/stale replication snapshots
my $last_sync_snapname =
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
my $sync_snapname =
PVE::ReplicationState::replication_snapshot_name($jobid, $start_time);
+ my $parent_snapname = $conf->{parent};
+
+ # test if we have a replication_ snapshot from last sync
+ # and remove all other/stale replication snapshots
+
my $last_snapshots = prepare(
- $storecfg, $sorted_volids, $jobid, $last_sync, $start_time, $logfunc);
+ $storecfg, $sorted_volids, $jobid, $last_sync, $parent_snapname, $logfunc);
+
+ # prepare remote side
+ my $remote_snapshots = remote_prepare_local_job(
+ $ssh_info, $jobid, $vmid, $sorted_volids, $state->{storeid_list}, $last_sync, $parent_snapname, 0);
my $storeid_hash = {};
foreach my $volid (@$sorted_volids) {
@@ -313,7 +322,7 @@ sub replicate {
my $cleanup_local_snapshots = sub {
my ($volid_hash, $snapname) = @_;
foreach my $volid (sort keys %$volid_hash) {
- $logfunc->("$jobid: delete snapshot '$snapname' on $volid");
+ $logfunc->("$jobid: delete previous replication snapshot '$snapname' on $volid");
eval { PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snapname, $running); };
warn $@ if $@;
}
@@ -331,12 +340,21 @@ sub replicate {
foreach my $volid (@$sorted_volids) {
my $base_snapname;
- if ($last_snapshots->{$volid} && $remote_snapshots->{$volid}) {
- $logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
- $base_snapname = $last_sync_snapname;
- } else {
- $logfunc->("$jobid: full sync '$volid' ($sync_snapname)");
+
+ if (defined($last_snapshots->{$volid}) && defined($remote_snapshots->{$volid})) {
+ if ($last_snapshots->{$volid}->{$last_sync_snapname} &&
+ $remote_snapshots->{$volid}->{$last_sync_snapname}) {
+ $logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
+ $base_snapname = $last_sync_snapname;
+ } elsif (defined($parent_snapname) &&
+ ($last_snapshots->{$volid}->{$parent_snapname} &&
+ $remote_snapshots->{$volid}->{$parent_snapname})) {
+ $logfunc->("$jobid: incremental sync '$volid' ($parent_snapname => $sync_snapname)");
+ $base_snapname = $parent_snapname;
+ }
}
+
+ $logfunc->("$jobid: full sync '$volid' ($sync_snapname)") if !defined($base_snapname);
replicate_volume($ssh_info, $storecfg, $volid, $base_snapname, $sync_snapname, $rate, $insecure);
}
};
diff --git a/bin/test/replication_test5.log b/bin/test/replication_test5.log
index e70c79ba..b365048e 100644
--- a/bin/test/replication_test5.log
+++ b/bin/test/replication_test5.log
@@ -13,7 +13,7 @@
1840 job_900_to_node2: volumes => local-zfs:vm-900-disk-1
1840 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
1840 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1000__ => __replicate_job_900_to_node2_1840__)
-1840 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
+1840 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
1840 job_900_to_node2: end replication job
1840 job_900_to_node2: changed config next_sync => 2700
1840 job_900_to_node2: changed state last_try => 1840, last_sync => 1840
@@ -22,7 +22,7 @@
2740 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-2
-2740 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
+2740 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
2740 job_900_to_node2: end replication job with error: no such volid 'local-zfs:vm-900-disk-2'
2740 job_900_to_node2: changed config next_sync => 3040
2740 job_900_to_node2: changed state last_try => 2740, fail_count => 1, error => no such volid 'local-zfs:vm-900-disk-2'
@@ -33,7 +33,7 @@
3040 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
3040 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1840__ => __replicate_job_900_to_node2_3040__)
3040 job_900_to_node2: full sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__)
-3040 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
+3040 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
3040 job_900_to_node2: end replication job
3040 job_900_to_node2: changed config next_sync => 3600
3040 job_900_to_node2: changed state last_try => 3040, last_sync => 3040, fail_count => 0, error =>
@@ -44,8 +44,8 @@
3640 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
-3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
-3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
+3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
+3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
3640 job_900_to_node2: end replication job
3640 job_900_to_node2: changed config next_sync => 4500
3640 job_900_to_node2: changed state last_try => 3640, last_sync => 3640
@@ -53,8 +53,8 @@
3700 job_900_to_node2: guest => 900, type => qemu, running => 0
3700 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
3700 job_900_to_node2: start job removal - mode 'full'
-3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
-3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
+3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
+3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
3700 job_900_to_node2: job removed
3700 job_900_to_node2: end replication job
3700 job_900_to_node2: vanished job
diff --git a/bin/test/replication_test5.pl b/bin/test/replication_test5.pl
index 3db7351d..93afe81f 100755
--- a/bin/test/replication_test5.pl
+++ b/bin/test/replication_test5.pl
@@ -32,7 +32,7 @@ use PVE::Storage;
my $replicated_volume_status = {};
my $mocked_remote_prepare_local_job = sub {
- my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
+ my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
my $target = $ssh_info->{node};
@@ -49,7 +49,7 @@ my $mocked_remote_prepare_local_job = sub {
}
my $snapname = $replicated_volume_status->{$target}->{$volid};
- $last_snapshots->{$volid} = 1 if $last_sync_snapname eq $snapname;
+ $last_snapshots->{$volid}->{$snapname} = 1 if $last_sync_snapname eq $snapname;
}
return $last_snapshots;
--
2.11.0
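
For completeness: with this patch the remote side is prepared with roughly

    pvesr prepare-local-job <jobid> <vmid> --scan <storeid-list> <volid...> --last_sync <timestamp> --parent_snapname <snapshot> [--force]

and prepare()/prepare-local-job now return a nested map instead of the old
flat one. A sketch with made-up values (the parent snapshot name 'snap1' is
just an example for $conf->{parent}):

    # before: volid => 1, i.e. only "last-sync snapshot present"
    my $old = { 'local-zfs:vm-900-disk-1' => 1 };

    # after: volid => { snapname => 1 }, listing every usable base snapshot
    my $new = {
	'local-zfs:vm-900-disk-1' => {
	    '__replicate_job_900_to_node2_3640__' => 1,
	    'snap1' => 1, # $conf->{parent}, if still present on both sides
	},
    };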