[pve-devel] [RFC v2 pve-ha-manager 1/3] implement recovery policy for services

Thomas Lamprecht t.lamprecht at proxmox.com
Tue Sep 15 14:18:50 CEST 2015


We implement recovery policies which use settings known from
rgmanager, however the behaviour is not strictly the same.

The following policy settings kick in on a failed
service start:
* max_restart:  maximal number of tries to restart a failed service
                on the current node. The default is 1 restart try.
                This policy gets enforced by the LRM.

* max_relocate: maximal number of tries to relocate the service to
                a different node. A relocate only takes place after
                the max_restart value is exceeded on the current node.
                This policy gets enforced by the CRM.

If a service is still not running after all tries are exhausted, its
state gets set to 'error'. This means that the service needs to be
checked and disabled manually.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Env/PVE2.pm  |  4 ++--
 src/PVE/HA/LRM.pm       | 40 ++++++++++++++++++++++++++++++++++++++++
 src/PVE/HA/Manager.pm   | 29 +++++++++++++++++++++++++++--
 src/PVE/HA/Resources.pm | 20 ++++++++++++++++++++
 4 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index d508922..5d126b0 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -97,6 +97,8 @@ sub read_service_config {
 	my $d = $res->{ids}->{$sid};
 	my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);
 	$d->{state} = 'enabled' if !defined($d->{state});
+	$d->{max_restart} = 1 if !defined($d->{max_restart});
+	$d->{max_relocate} = 1 if !defined($d->{max_relocate});
 	if (PVE::HA::Resources->lookup($d->{type})) {
 	    if (my $vmd = $vmlist->{ids}->{$name}) {
 		if (!$vmd) {
@@ -392,8 +394,6 @@ sub exec_resource_agent {
 
     if ($cmd eq 'started') {
 
-	# fixme: count failures
-	
 	return 0 if $running;
 
 	$self->log("info", "starting service $sid");
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index bc8ed52..dc2856b 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -29,6 +29,7 @@ sub new {
 	status => { state => 'startup' },
 	workers => {},
 	results => {},
+	restart_tries => {},
 	shutdown_request => 0,
 	# mode can be: active, reboot, shutdown, restart
 	mode => 'active',
@@ -448,6 +449,8 @@ sub resource_command_finished {
 	$exit_code = ($status >> 8);
     }
 
+    $exit_code = $self->handle_service_exitcode($sid, $w->{state}, $exit_code);
+
     $self->{results}->{$uid} = {
 	sid => $w->{sid},
 	state => $w->{state},
@@ -472,4 +475,41 @@ sub resource_command_finished {
     $self->{results} = $results;
 }
 
+sub handle_service_exitcode {
+    my ($self, $sid, $cmd, $exit_code) = @_;
+
+    my $haenv = $self->{haenv};
+    my $tries = $self->{restart_tries};
+
+    my $sc = $haenv->read_service_config();
+    my $cd = $sc->{$sid};
+
+    if ($cmd eq 'started') {
+
+	if ($exit_code == 0) {
+
+	    $tries->{$sid} = 0;
+
+	    return $exit_code;
+
+	} elsif ($exit_code == 1) {
+
+	    $tries->{$sid} = 0 if !defined($tries->{$sid});
+
+	    $tries->{$sid}++;
+	    if ($tries->{$sid} >= $cd->{max_restart}) {
+		$haenv->log('err', "unable to start service $sid on local node".
+			          " after $tries->{$sid} retries");
+		$tries->{$sid} = 0;
+		return 1;
+	    }
+
+	    return 2;
+	}
+    }
+
+    return $exit_code;
+
+}
+
 1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 746c1da..ca20c1b 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -517,6 +517,7 @@ sub next_state_started {
     my ($self, $sid, $cd, $sd, $lrm_res) = @_;
 
     my $haenv = $self->{haenv};
+    my $master_status = $self->{ms};
     my $ns = $self->{ns};
 
     if (!$ns->node_is_online($sd->{node})) {
@@ -552,8 +553,32 @@ sub next_state_started {
 	} else {
 
 	    my $try_next = 0;
-	    if ($lrm_res && ($lrm_res->{exit_code} != 0)) { # fixme: other exit codes?
-		$try_next = 1;
+	    if ($lrm_res) {
+		if ($lrm_res->{exit_code} == 1) {
+
+		    my $try = $master_status->{relocate_trial}->{$sid} || 0;
+
+		    $try++;
+		    if($try < $cd->{max_relocate}) {
+
+			$try_next = 1;
+
+			$haenv->log('warning', "starting service $sid on node".
+				   " '$sd->{node}' failed, relocating service.");
+			$master_status->{relocate_trial}->{$sid} = $try;
+
+		    } else {
+
+			$master_status->{relocate_trial}->{$sid} = 0;
+			$haenv->log('err', "recovery policy for service".
+				          " $sid failed, entering error state!");
+			&$change_service_state($self, $sid, 'error');
+			return;
+
+		    }
+		} elsif($lrm_res->{exit_code} == 0) {
+		    $master_status->{relocate_trial}->{$sid} = 0;
+		}
 	    }
 
 	    my $node = select_service_node($self->{groups}, $self->{online_node_usage}, 
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 2bdebb9..4c8cd1b 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -21,6 +21,22 @@ my $defaultData = {
 	    optional => 1,
 	    default => 'enabled',
 	},
+	max_restart => {
+	    description => "Maximal number of tries to restart the service on".
+		          " a node after its start failed.",
+	    type => 'integer',
+	    optional => 1,
+	    default => 1,
+	    minimum => 0,
+	},
+	max_relocate => {
+	    description => "Maximal number of service relocate tries when a".
+		          " service failes to start.",
+	    type => 'integer',
+	    optional => 1,
+	    default => 1,
+	    minimum => 0,
+	},
 	group => get_standard_option('pve-ha-group-id', { optional => 1 }),
 	comment => {
 	    description => "Description.",
@@ -116,6 +132,8 @@ sub options {
 	state => { optional => 1 },
 	group => { optional => 1 },
 	comment => { optional => 1 },
+	max_restart => { optional => 1 },
+	max_relocate => { optional => 1 },
     };
 }
 
@@ -180,6 +198,8 @@ sub options {
 	state => { optional => 1 },
 	group => { optional => 1 },
 	comment => { optional => 1 },
+	max_restart => { optional => 1 },
+	max_relocate => { optional => 1 },
     };
 }
 
-- 
2.1.4




More information about the pve-devel mailing list