[pve-devel] [PATCH ha-manager 1/2] Manager: record tried node on relocation policy

Thu May 19 15:08:16 CEST 2016

Instead of simply counting up an integer on each failed relocation
trial, record the already tried nodes. We still have the try count
through the size of the array, so no information is lost and there
is no behavioural change.

For now, use this to log the nodes on which recovery failed. This
may be useful for a user to see which nodes fail, so that they can
investigate the reason and fix it.

Furthermore, this prepares us for a more intelligent recovery node
selection, as we can skip nodes already tried in the current
recovery cycle.

By reusing the relocate_trial entry as relocate_tried_nodes, this
can happen without any overhead (i.e. an additional hash) in the
manager status.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Manager.pm                      | 21 +++++++++++++--------
 src/test/test-resource-failure2/log.expect |  1 +
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 622ece8..e13c782 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -349,8 +349,8 @@ sub manage {
     }
 
     # remove stale relocation try entries
-    foreach my $sid (keys %{$ms->{relocate_trial}}) {
-	delete $ms->{relocate_trial}->{$sid} if !$ss->{$sid};
+    foreach my $sid (keys %{$ms->{relocate_tried_nodes}}) {
+	delete $ms->{relocate_tried_nodes}->{$sid} if !$ss->{$sid};
     }
 
     $self->update_crm_commands();
@@ -589,26 +589,31 @@ sub next_state_started {
 	} else {
 
 	    my $try_next = 0;
+	    my $tried_nodes = $master_status->{relocate_tried_nodes}->{$sid} || [];
 	    if ($lrm_res) {
 		my $ec = $lrm_res->{exit_code};
 		if ($ec == SUCCESS) {
 
-		    $master_status->{relocate_trial}->{$sid} = 0;
+		    if (@$tried_nodes) {
+			$haenv->log('info', "relocation policy successful for '$sid'," .
+				    " tried nodes: " . join(', ', @$tried_nodes) );
+		    }
+
+		    delete $master_status->{relocate_tried_nodes}->{$sid};
 
 		} elsif ($ec == ERROR) {
 		    # apply our relocate policy if we got ERROR from the LRM
 
-		    my $try = $master_status->{relocate_trial}->{$sid} || 0;
-
-		    if ($try < $cd->{max_relocate}) {
+		    if (scalar @$tried_nodes < $cd->{max_relocate}) {
 
-			$try++;
 			# tell select_service_node to relocate if possible
 			$try_next = 1;
+			# add current service  node to failed list
+			push @$tried_nodes, $sd->{node};
+			$master_status->{relocate_tried_nodes}->{$sid} = $tried_nodes;
 
 			$haenv->log('warning', "starting service $sid on node".
 				   " '$sd->{node}' failed, relocating service.");
-			$master_status->{relocate_trial}->{$sid} = $try;
 
 		    } else {
 
diff --git a/src/test/test-resource-failure2/log.expect b/src/test/test-resource-failure2/log.expect
index 604ad95..37cc461 100644
--- a/src/test/test-resource-failure2/log.expect
+++ b/src/test/test-resource-failure2/log.expect
@@ -41,4 +41,5 @@ info    201    node1/lrm: got lock 'ha_agent_node1_lock'
 info    201    node1/lrm: status change wait_for_agent_lock => active
 info    201    node1/lrm: starting service fa:130
 info    201    node1/lrm: service status fa:130 started
+info    220    node1/crm: relocation policy successful for 'fa:130', tried nodes: node2
 info    720     hardware: exit simulation - done
-- 
2.1.4