[pve-devel] [PATCH ha-manager 1/3] improve relocation policy code in manager and LRM

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Feb 10 14:13:42 CET 2016


Without this change a few branches would never be taken, and the
behaviour wasn't quite straightforward.

Only increment the tries counter if we really retry, and log each retry.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/LRM.pm     |  5 ++++-
 src/PVE/HA/Manager.pm | 22 ++++++++++++++++++----
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index d7b54da..2692ca8 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -588,7 +588,6 @@ sub handle_service_exitcode {
 
 	    $tries->{$sid} = 0 if !defined($tries->{$sid});
 
-	    $tries->{$sid}++;
 	    if ($tries->{$sid} >= $max_restart) {
 		$haenv->log('err', "unable to start service $sid on local node".
 			   " after $tries->{$sid} retries");
@@ -596,6 +595,10 @@ sub handle_service_exitcode {
 		return ERROR;
 	    }
 
+	    $tries->{$sid}++;
+
+	    $haenv->log('warning', "restart policy: retry number $tries->{$sid}" .
+			" for service '$sid'");
 	    # tell CRM that we retry the start
 	    return ETRY_AGAIN;
 	}
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 48826e7..21a34dd 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -556,14 +556,25 @@ sub next_state_started {
 
 	    my $try_next = 0;
 	    if ($lrm_res) {
-		if ($lrm_res->{exit_code} == ERROR) {
+		my $ec = $lrm_res->{exit_code};
+		if ($ec == SUCCESS) {
+
+		    $master_status->{relocate_trial}->{$sid} = 0;
+
+		} elsif ($ec == ETRY_AGAIN) {
+
+		    # do nothing, the LRM wants to try again
+
+		} elsif ($ec == ERROR) {
+		    # apply our relocate policy if we got ERROR from the LRM
 
 		    my $try = $master_status->{relocate_trial}->{$sid} || 0;
 
 		    if ($try < $cd->{max_relocate}) {
 
 			$try++;
-			$try_next = 1; # tell select_service_node to relocate
+			# tell select_service_node to relocate if possible
+			$try_next = 1;
 
 			$haenv->log('warning', "starting service $sid on node".
 				   " '$sd->{node}' failed, relocating service.");
@@ -577,8 +588,11 @@ sub next_state_started {
 			return;
 
 		    }
-		} elsif ($lrm_res->{exit_code} == SUCCESS) {
-		    $master_status->{relocate_trial}->{$sid} = 0;
+		} else {
+		    $haenv->log('err', "service '$sid' got unrecoverable error" .
+				" (exit code $ec))");
+		    # we have no safe way out (yet) for other errors
+		    &$change_service_state($self, $sid, 'error');
 		}
 	    }
 
-- 
2.1.4





More information about the pve-devel mailing list