[pve-devel] [PATCH pve-ha-manager] limit service start tries

Thomas Lamprecht t.lamprecht at proxmox.com
Mon Sep 7 13:52:16 CEST 2015


Previously we tried to start a service indefinitely often; now we
limit it to 3 start tries. After that the service goes into the error
state and needs to be disabled manually.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Env/PVE2.pm |  2 --
 src/PVE/HA/Manager.pm  | 29 +++++++++++++++++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index d508922..e49245a 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -392,8 +392,6 @@ sub exec_resource_agent {
 
     if ($cmd eq 'started') {
 
-	# fixme: count failures
-	
 	return 0 if $running;
 
 	$self->log("info", "starting service $sid");
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 746c1da..405ba1d 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -180,11 +180,12 @@ my $change_service_state = sub {
     my ($self, $sid, $new_state, %params) = @_;
 
     my ($haenv, $ss) = ($self->{haenv}, $self->{ss});
-
+    
     my $sd = $ss->{$sid} || die "no such service '$sid";
 
     my $old_state = $sd->{state};
     my $old_node = $sd->{node};
+    my $old_try = $sd->{start_try};
 
     die "no state change" if $old_state eq $new_state; # just to be sure
 
@@ -194,6 +195,7 @@ my $change_service_state = sub {
 
     $sd->{state} = $new_state;
     $sd->{node} = $old_node;
+    $sd->{start_try} = $old_try;
 
     my $text_state = '';
     foreach my $k (sort keys %params) {
@@ -294,7 +296,7 @@ sub manage {
 	next if $ss->{$sid}; # already there
 	$haenv->log('info', "adding new service '$sid' on node '$sc->{$sid}->{node}'");
 	# assume we are running to avoid relocate running service at add
-	$ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node},
+	$ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}, start_try => 0,
 			uid => compute_new_uuid('started') };
     }
 
@@ -552,8 +554,27 @@ sub next_state_started {
 	} else {
 
 	    my $try_next = 0;
-	    if ($lrm_res && ($lrm_res->{exit_code} != 0)) { # fixme: other exit codes?
-		$try_next = 1;
+	    if(defined($lrm_res)) {
+		if ($lrm_res->{exit_code} == 0) {
+		    $sd->{start_try} = 0;
+		} else {
+		    $try_next = 1;
+
+		    $sd->{start_try} = 0 if !defined($sd->{start_trial});
+
+		    my $max_start_tries = 3; # fixme: make configurable
+
+		    if($sd->{start_try} >= $max_start_tries) {
+			$haenv->log('err', "service $sid couldn't be started after " .
+				   "$max_start_tries tries and needs manual intervention");
+			$sd->{trial} = 0;
+			&$change_service_state($self, $sid, 'error');
+			return;
+		    }
+
+		    $sd->{start_try}++;
+
+		}
 	    }
 
 	    my $node = select_service_node($self->{groups}, $self->{online_node_usage}, 
-- 
2.1.4




More information about the pve-devel mailing list