[pve-devel] [PATCH ha-manager 2/4] cleanup service from old locks after recovery (fixes #1100)

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Sep 9 16:22:01 CEST 2016


An additional note on this problem:

Here there may still be some half-generated files left over from the 
interrupted backup.
While we log in the journal that we removed a lock, this is easily 
overlooked.
An idea from my side would be to create a recovery report after a node 
failure, listing the recovered services together with the information 
whether any cleanup operations were made.
That way the user/admin can clean up the half-finished backups 
themselves and retry the backup.

Automagically cleaning up such things isn't really the job of the HA 
stack, and it's also not trivially possible, imo.


On 09/09/2016 04:15 PM, Thomas Lamprecht wrote:
> This cleans up all locks after a service could be recovered from a
> node, else it may not start if its old node failed during the time
> when the service was locked, e.g. by backup/snapshot. As the action
> which the lock protects was killed anyway, the purpose of the lock
> is gone.
>
> Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
> ---
>   src/PVE/HA/Env/PVE2.pm                   |  2 ++
>   src/PVE/HA/Resources.pm                  |  9 +++++++
>   src/PVE/HA/Resources/PVECT.pm            | 13 ++++++++++
>   src/PVE/HA/Resources/PVEVM.pm            | 13 ++++++++++
>   src/PVE/HA/Sim/Env.pm                    |  9 ++++++-
>   src/PVE/HA/Sim/Resources.pm              | 16 ++++++++++++
>   src/test/test-locked-service1/log.expect | 44 ++++++++++++++++++++++++++++++++
>   7 files changed, 105 insertions(+), 1 deletion(-)
>   create mode 100644 src/test/test-locked-service1/log.expect
>
> diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
> index ef6485d..6ed7c29 100644
> --- a/src/PVE/HA/Env/PVE2.pm
> +++ b/src/PVE/HA/Env/PVE2.pm
> @@ -184,6 +184,8 @@ sub steal_service {
>   	my $new = $plugin->config_file($name, $new_node);
>   	rename($old, $new) ||
>   	    die "rename '$old' to '$new' failed - $!\n";
> +
> +	$plugin->after_recovery_cleanup($self, $name);
>       } else {
>   	die "implement me";
>       }
> diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
> index 3836fc8..365d31e 100644
> --- a/src/PVE/HA/Resources.pm
> +++ b/src/PVE/HA/Resources.pm
> @@ -124,6 +124,15 @@ sub check_running {
>       die "implement in subclass";
>   }
>   
> +# for cleaning up stale state left over from unfinished actions on the failed
> +# node, e.g. the lock from a service backup
> +sub after_recovery_cleanup {
> +    my ($self, $haenv, $id) = @_;
> +
> +    die "implement in subclass";
> +}
> +
> +
>   
>   # package PVE::HA::Resources::IPAddr;
>   
> diff --git a/src/PVE/HA/Resources/PVECT.pm b/src/PVE/HA/Resources/PVECT.pm
> index 0b44c70..fe9e522 100644
> --- a/src/PVE/HA/Resources/PVECT.pm
> +++ b/src/PVE/HA/Resources/PVECT.pm
> @@ -113,4 +113,17 @@ sub check_running {
>       return PVE::LXC::check_running($vmid);
>   }
>   
> +sub after_recovery_cleanup {
> +    my ($self, $haenv, $id) = @_;
> +
> +    # remove all config locks as any leftovers were rendered unnecessary when the
> +    # old node of the service was fenced
> +    if (my $removed_lock = PVE::LXC::Config->remove_lock($id)) {
> +	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
> +	            "service 'ct:$id' to allow its start.");
> +    }
> +
> +    # TODO: anything else?
> +}
> +
>   1;
> diff --git a/src/PVE/HA/Resources/PVEVM.pm b/src/PVE/HA/Resources/PVEVM.pm
> index 4c06df9..d996940 100644
> --- a/src/PVE/HA/Resources/PVEVM.pm
> +++ b/src/PVE/HA/Resources/PVEVM.pm
> @@ -116,4 +116,17 @@ sub check_running {
>       return PVE::QemuServer::check_running($vmid, 1, $nodename);
>   }
>   
> +sub after_recovery_cleanup {
> +    my ($self, $haenv, $id) = @_;
> +
> +    # remove all config locks as any leftovers were rendered unnecessary when the
> +    # old node of the service was fenced
> +    if (my $removed_lock = PVE::QemuConfig->remove_lock($id)) {
> +	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
> +	            "service 'vm:$id' to allow its start.");
> +    }
> +
> +    # TODO: anything else?
> +}
> +
>   1;
> diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
> index cd1574c..c91f67f 100644
> --- a/src/PVE/HA/Sim/Env.pm
> +++ b/src/PVE/HA/Sim/Env.pm
> @@ -199,7 +199,14 @@ sub read_group_config {
>   sub steal_service {
>       my ($self, $sid, $current_node, $new_node) = @_;
>   
> -    return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
> +    $self->{hardware}->change_service_location($sid, $current_node, $new_node);
> +
> +    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);
> +    if (my $plugin = PVE::HA::Sim::Resources->lookup($type)) {
> +	$plugin->after_recovery_cleanup($self, $name);
> +    } else {
> +	die "implement me";
> +    }
>   }
>   
>   sub queue_crm_commands {
> diff --git a/src/PVE/HA/Sim/Resources.pm b/src/PVE/HA/Sim/Resources.pm
> index ec3d775..e664821 100644
> --- a/src/PVE/HA/Sim/Resources.pm
> +++ b/src/PVE/HA/Sim/Resources.pm
> @@ -113,5 +113,21 @@ sub migrate {
>       return defined($ss->{$sid}) ? 0 : 1;
>   }
>   
> +sub after_recovery_cleanup {
> +    my ($self, $haenv, $id) = @_;
> +
> +    my $sid = $self->type() . ":$id";
> +    my $hardware = $haenv->hardware();
> +
> +    # remove all config locks as any leftovers were rendered unnecessary when the
> +    # old node of the service was fenced
> +    if (my $removed_lock = $hardware->unlock_service($sid)) {
> +	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
> +	            "service '$sid' to allow its start.");
> +    }
> +
> +    # TODO: anything else?
> +}
> +
>   
>   1;
> diff --git a/src/test/test-locked-service1/log.expect b/src/test/test-locked-service1/log.expect
> new file mode 100644
> index 0000000..42eed65
> --- /dev/null
> +++ b/src/test/test-locked-service1/log.expect
> @@ -0,0 +1,44 @@
> +info      0     hardware: starting simulation
> +info     20      cmdlist: execute power node1 on
> +info     20    node1/crm: status change startup => wait_for_quorum
> +info     20    node1/lrm: status change startup => wait_for_agent_lock
> +info     20      cmdlist: execute power node2 on
> +info     20    node2/crm: status change startup => wait_for_quorum
> +info     20    node2/lrm: status change startup => wait_for_agent_lock
> +info     20      cmdlist: execute power node3 on
> +info     20    node3/crm: status change startup => wait_for_quorum
> +info     20    node3/lrm: status change startup => wait_for_agent_lock
> +info     20    node1/crm: got lock 'ha_manager_lock'
> +info     20    node1/crm: status change wait_for_quorum => master
> +info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
> +info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
> +info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
> +info     20    node1/crm: adding new service 'vm:103' on node 'node3'
> +info     22    node2/crm: status change wait_for_quorum => slave
> +info     24    node3/crm: status change wait_for_quorum => slave
> +info     25    node3/lrm: got lock 'ha_agent_node3_lock'
> +info     25    node3/lrm: status change wait_for_agent_lock => active
> +info     25    node3/lrm: starting service vm:103
> +info     25    node3/lrm: service status vm:103 started
> +info    120      cmdlist: execute service vm:103 lock
> +info    220      cmdlist: execute network node3 off
> +info    220    node1/crm: node 'node3': state changed from 'online' => 'unknown'
> +info    224    node3/crm: status change slave => wait_for_quorum
> +info    225    node3/lrm: status change active => lost_agent_lock
> +info    260    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
> +info    260    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
> +info    266     watchdog: execute power node3 off
> +info    265    node3/crm: killed by poweroff
> +info    266    node3/lrm: killed by poweroff
> +info    266     hardware: server 'node3' stopped by poweroff (watchdog)
> +info    340    node1/crm: got lock 'ha_agent_node3_lock'
> +info    340    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
> +info    340    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
> +info    340    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
> +warn    340    node1/crm: removed leftover lock 'virtual' from recovered service 'vm:103' to allow its start.
> +info    340    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
> +info    341    node1/lrm: got lock 'ha_agent_node1_lock'
> +info    341    node1/lrm: status change wait_for_agent_lock => active
> +info    341    node1/lrm: starting service vm:103
> +info    341    node1/lrm: service status vm:103 started
> +info    820     hardware: exit simulation - done
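
As a small usage note: if I recall our test harness correctly, the new 
test case should be runnable on its own from src/test with something 
like

    ./ha-tester.pl test-locked-service1

besides the usual full 'make test' run.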




