[pve-devel] [RFC v3 pve-ha-manager 3/8] lrm: implement different behavior for shutdown and restart

Wed Dec 16 09:37:50 CET 2015

On 12/16/2015 09:04 AM, Dietmar Maurer wrote:
> We want to keep resources running at service restart (freeze), but
> want to stop them at shutdown.
>
> The actual stopping of services will follow in the next patch.
>
> Signed-off-by: Dietmar Maurer <dietmar at proxmox.com>
> ---
>   src/PVE/HA/LRM.pm                | 32 +++++++++++++++++++++++---------
>   src/PVE/HA/Sim/TestHardware.pm   |  6 ++----
>   src/test/test-reboot1/log.expect |  2 +-
>   3 files changed, 26 insertions(+), 14 deletions(-)
>
> diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
> index 89177ee..f428465 100644
> --- a/src/PVE/HA/LRM.pm
> +++ b/src/PVE/HA/LRM.pm
> @@ -43,9 +43,19 @@ sub new {
>   sub shutdown_request {
>       my ($self) = @_;
>   
> -    $self->{shutdown_request} = 1;
> +    my $haenv = $self->{haenv};
> +
> +    my $shutdown = $haenv->is_poweroff();
> +
> +    if ($shutdown) {
> +	$haenv->log('info', "shutdown LRM, stop all services");
> +	$self->{mode} = 'shutdown';
> +    } else {
> +	$haenv->log('info', "restart LRM, freeze all services");
> +	$self->{mode} = 'restart';
> +    }
>   
> -    $self->{mode} = 'restart'; # fixme: detect shutdown/reboot
> +    $self->{shutdown_request} = 1;
>   
>       eval { $self->update_lrm_status(); };
>       if (my $err = $@) {
> @@ -241,17 +251,21 @@ sub do_one_iteration {
>   
>   	    if ($self->{shutdown_request}) {
>   
> -		# fixme: request service stop or relocate ?
> +		if ($self->{mode} eq 'restart') {
>   
> -		my $service_count = $self->active_service_count();
> +		    my $service_count = $self->active_service_count();
>   
> -		if ($service_count == 0) {
> +		    if ($service_count == 0) {
>   
> -		    if ($self->{ha_agent_wd}) {
> -			$haenv->watchdog_close($self->{ha_agent_wd});
> -			delete $self->{ha_agent_wd};
> -		    }
> +			if ($self->{ha_agent_wd}) {
> +			    $haenv->watchdog_close($self->{ha_agent_wd});
> +			    delete $self->{ha_agent_wd};
> +			}

Do we really want to close the watchdog only on a node restart?
What if we gracefully shut down the node and need more than 60s to stop all
services (once we implement that)? Then the watchdog will reset the node
before we are done - killing all remaining VMs AND also restarting the node,
which would be undesired on a shutdown.

I would propose that we close the watchdog on (restart AND service_count 
== 0) OR (shutdown).
>   
> +			$shutdown = 1;
> +		    }
> +		} else {
> +		    # fixme: stop all services
>   		    $shutdown = 1;
>   		}
>   	    } else {
> diff --git a/src/PVE/HA/Sim/TestHardware.pm b/src/PVE/HA/Sim/TestHardware.pm
> index 06a3208..3a06906 100644
> --- a/src/PVE/HA/Sim/TestHardware.pm
> +++ b/src/PVE/HA/Sim/TestHardware.pm
> @@ -138,10 +138,8 @@ sub sim_hardware_cmd {
>   
>   	    $self->write_hardware_status_nolock($cstatus);
>   
> -	    if ($d->{lrm}) {
> -		$d->{lrm_env}->log('info', "got shutdown request");
> -		$d->{lrm}->shutdown_request();
> -	    }
> +	    $d->{lrm}->shutdown_request() if $d->{lrm};
> +
>   	} else {
>   	    die "sim_hardware_cmd: unknown command '$cmdstr'\n";
>   	}
> diff --git a/src/test/test-reboot1/log.expect b/src/test/test-reboot1/log.expect
> index d9d8771..43e7d0f 100644
> --- a/src/test/test-reboot1/log.expect
> +++ b/src/test/test-reboot1/log.expect
> @@ -21,7 +21,7 @@ info     25    node3/lrm: status change wait_for_agent_lock => active
>   info     25    node3/lrm: starting service vm:103
>   info     25    node3/lrm: service status vm:103 started
>   info    120      cmdlist: execute reboot node3
> -info    120    node3/lrm: got shutdown request
> +info    120    node3/lrm: restart LRM, freeze all services
>   info    120    node1/crm: service 'vm:103': state changed from 'started' to 'freeze'
>   info    126    node3/lrm: exit (loop end)
>   info    126       reboot: execute power node3 off