[pve-devel] [PATCH pve-manager 11/18] pvesr prepare-local-job: new helper
Dietmar Maurer
dietmar at proxmox.com
Mon May 29 12:03:08 CEST 2017
comment inline
> On May 29, 2017 at 8:07 AM Fabian Grünbichler <f.gruenbichler at proxmox.com> wrote:
>
>
> On Tue, May 23, 2017 at 09:08:50AM +0200, Dietmar Maurer wrote:
> > Prepare for starting a replication job. This is called on the target
> > node before replication starts. This call is for internal use, and
> > returns a JSON object on stdout. The method first tests if VM <vmid>
> > resides on the local node. If so, it stops immediately. After that the
> > method scans all volume IDs for snapshots, and removes all replication
> > snapshots with timestamps different from <last_sync>. It also removes
> > any unused volumes.
> >
> > Returns a hash with boolean markers for all volumes with existing
> > replication snapshots.
> >
> > Signed-off-by: Dietmar Maurer <dietmar at proxmox.com>
> > ---
> > PVE/CLI/pvesr.pm | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > PVE/Replication.pm | 30 +++++++++++++++++
> > 2 files changed, 127 insertions(+), 1 deletion(-)
> >
> > diff --git a/PVE/CLI/pvesr.pm b/PVE/CLI/pvesr.pm
> > index dee002ee..78d69747 100644
> > --- a/PVE/CLI/pvesr.pm
> > +++ b/PVE/CLI/pvesr.pm
> > @@ -4,6 +4,7 @@ use strict;
> > use warnings;
> > use POSIX qw(strftime);
> > use Time::Local;
> > +use Data::Dumper;
>
> not used?
removed in v2
>
> > use JSON;
> >
> > use PVE::JSONSchema qw(get_standard_option);
> > @@ -13,6 +14,7 @@ use PVE::Tools qw(extract_param);
> > use PVE::SafeSyslog;
> > use PVE::CLIHandler;
> >
> > +use PVE::Cluster;
> > use PVE::Replication;
> > use PVE::API2::ReplicationConfig;
> > use PVE::API2::Replication;
> > @@ -26,6 +28,98 @@ sub setup_environment {
> > }
> >
> > __PACKAGE__->register_method ({
> > + name => 'prepare_local_job',
> > + path => 'prepare_local_job',
> > + method => 'POST',
> > + description => "Prepare for starting a replication job. This is called on the target node before replication starts. This call is for internal use, and return a JSON object on stdout. The method first test if VM <vmid> reside on the local node. If so, stop immediately. After that the method scans all volume IDs for snapshots, and removes all replications snapshots with timestamps different than <last_sync>. It also removes any unused volumes. Returns a hash with boolean markers for all volumes with existing replication snapshots.",
> > + parameters => {
> > + additionalProperties => 0,
> > + properties => {
> > + id => get_standard_option('pve-replication-id'),
> > + vmid => get_standard_option('pve-vmid', { completion => \&PVE::Cluster::complete_vmid }),
> > + 'extra-args' => get_standard_option('extra-args', { description => "The list of volume IDs to consider." }),
> > + last_sync => {
> > + description => "Time (UNIX epoch) of last successful sync. If not specified, all replication snapshots gets removed.",
>
> s/gets/get/
fixed in v2
>
> > + type => 'integer',
> > + minimum => 0,
> > + optional => 1,
> > + },
> > + },
> > + },
> > + returns => { type => 'null' },
> > + code => sub {
> > + my ($param) = @_;
> > +
> > + my $jobid = $param->{id};
> > + my $vmid = $param->{vmid};
> > + my $last_sync = $param->{last_sync} // 0;
> > +
> > + my $local_node = PVE::INotify::nodename();
> > +
> > + my $vms = PVE::Cluster::get_vmlist();
> > + die "guest '$vmid' is on local node\n"
> > + if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
> > +
> > + my $storecfg = PVE::Storage::config();
> > +
> > + my $dl = PVE::Storage::vdisk_list($storecfg, undef, $vmid);
>
> I really really dislike this. AFAICT it is the only time we do a full
> scan of all storages in the replication code path, instead of iterating
> over the disk referenced in the guest config. this is a potentially very
> expensive operation (e.g., on storages like Ceph or NFS), and should not
> be done on every job run..
I think we should skip shared storages here, and only consider local storages
with replication capabilities?
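Something along those lines, as a rough and untested sketch (reusing $storecfg,
$local_node and $vmid from the patch above, and assuming the parsed storage
config keeps its entries under ->{ids} and that vdisk_list returns a hash keyed
by storage ID when called for a single storage):

# instead of vdisk_list($storecfg, undef, $vmid), only look at enabled,
# non-shared (i.e. local) storages; a check for replication capability
# could be added here once there is a helper for that
my $dl = {};
foreach my $storeid (sort keys %{$storecfg->{ids}}) {
    my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
    next if !$scfg || $scfg->{shared};
    my $res = PVE::Storage::vdisk_list($storecfg, $storeid, $vmid);
    $dl->{$storeid} = $res->{$storeid} // [];
}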
> > +
> > + my $volids = [];
> > +
> > + die "no volumes specified\n" if !scalar(@{$param->{'extra-args'}});
> > +
> > + foreach my $volid (@{$param->{'extra-args'}}) {
> > +
> > + my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
> > + my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
> > + die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
> > +
> > + my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
> > + die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
> > + if $vtype ne 'images';
> > + die "volume '$volid' has wrong owner\n"
> > + if !$ownervm || $vmid != $ownervm;
> > +
> > + my $found = 0;
> > + foreach my $info (@{$dl->{$storeid}}) {
> > + if ($info->{volid} eq $volid) {
> > + $found = 1;
> > + last;
> > + }
> > + }
> > +
> > + push @$volids, $volid if $found;
> > + }
> > +
> > + $volids = [ sort @$volids ];
> > +
> > + my $logfunc = sub {
> > + my ($start_time, $msg) = @_;
> > + print STDERR "$msg\n";
> > + };
> > +
> > + # remove stale volumes
> > + foreach my $storeid (keys %$dl) {
> > + my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
> > + next if !$scfg || $scfg->{shared};
> > + foreach my $info (@{$dl->{$storeid}}) {
> > + my $volid = $info->{volid};
> > + next if grep { $_ eq $volid } @$volids;
> > + $logfunc->(undef, "$jobid: delete stale volume '$volid'");
> > + PVE::Storage::vdisk_free($storecfg, $volid);
>
> and now we not only cleaned up stale volumes which were replicated but
> are no longer there on the source, but also all other orphan disks of
> that guest.. intentionally?
yes
> > + }
> > + }
> > +
> > + my $last_snapshots = PVE::Replication::prepare(
> > + $storecfg, $volids, $jobid, $last_sync, undef, $logfunc);
> > +
> > + print to_json($last_snapshots) . "\n";
> > +
> > + return undef;
> > + }});
> > +
> > +__PACKAGE__->register_method ({
> > name => 'run',
> > path => 'run',
> > method => 'POST',
> > @@ -148,7 +242,7 @@ my $print_job_status = sub {
> > if ($next > $now) {
> > $nextstr = strftime("%Y-%m-%d_%H:%M:%S", localtime($job->{next_sync}));
> > } else {
> > - $nextstr = 'now'
> > + $nextstr = 'now';
>
> should be rebased into the proper commit
fixed in v2
>
> > }
> > }
> >
> > @@ -174,6 +268,8 @@ our $cmddef = {
> > enable => [ __PACKAGE__, 'enable', ['id'], {}],
> > disable => [ __PACKAGE__, 'disable', ['id'], {}],
> >
> > + 'prepare-local-job' => [ __PACKAGE__, 'prepare_local_job', ['id', 'vmid', 'extra-args'], {} ],
> > +
> > run => [ __PACKAGE__ , 'run'],
> > };
> >
> > diff --git a/PVE/Replication.pm b/PVE/Replication.pm
> > index 7bfdeb7d..0a0ed436 100644
> > --- a/PVE/Replication.pm
> > +++ b/PVE/Replication.pm
> > @@ -156,6 +156,36 @@ my $get_next_job = sub {
> > return $jobcfg;
> > };
> >
> > +sub replication_snapshot_name {
> > + my ($jobid, $last_sync) = @_;
> > +
> > + my $prefix = "replicate_${jobid}_";
> > + my $snapname = "$prefix${last_sync}_snap";
>
> I'd prefer ${...}${....} for readability ;)
> > +
> > + wantarray ? ($prefix, $snapname) : $snapname;
> > +}
> > +
> > +sub prepare {
> > + my ($storecfg, $volids, $jobid, $last_sync, $start_time, $logfunc) = @_;
> > +
> > + my ($prefix, $snapname) = replication_snapshot_name($jobid, $last_sync);
> > +
> > + my $last_snapshots = {};
> > + foreach my $volid (@$volids) {
> > + my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid, $prefix);
> > + my $found = 0;
> > + foreach my $snap (@$list) {
> > + if ($snap eq $snapname) {
> > + $last_snapshots->{$volid} = 1;
> > + } else {
> > + $logfunc->($start_time, "$jobid: delete stale snapshot '$snap' on $volid");
> > + PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
> > + }
> > + }
>
> this loop looks dangerous to me as well.. shouldn't we only delete
> intermediate snapshots if we know there is a correct later one? what if
> there is no snapshot "$snapname" (because we are in some kind of
> partially failed but recoverable state)? instead of a simple "sync last
> delta again (manually)" we are now in "full resync" territory..
>
> my $found = 0;
> foreach my $snap (@$list) {
>     if ($snap eq $snapname) {
>         $found = 1;
>         last;
>     }
> }
>
> if ($found) {
>     foreach my $snap (@$list) {
>         if ($snap ne $snapname) {
>             $logfunc->($start_time, "$jobid: delete stale snapshot '$snap' on $volid");
>             PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
>         }
>     }
> } else {
>     # big fat warning
> }
>
IMHO this is not really dangerous.
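
For reference, a hypothetical invocation of the new helper on the target node
could look roughly like this (job ID, VM ID and volume name are made up; the
printed JSON hash marks the volumes that still have the expected replication
snapshot):

# pvesr prepare-local-job 100-0 100 local-lvm:vm-100-disk-1 --last_sync 1496051234
{"local-lvm:vm-100-disk-1":1}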