[pve-devel] [PATCH ha-manager 1/3] improve relocation policy code in manager and LRM
Thomas Lamprecht
t.lamprecht at proxmox.com
Wed Feb 10 14:13:42 CET 2016
Otherwise a few branches would never be taken and the behaviour was
not quite straightforward.
Only increment the try counter when we really retry, and log each retry.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/LRM.pm | 5 ++++-
src/PVE/HA/Manager.pm | 22 ++++++++++++++++++----
2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index d7b54da..2692ca8 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -588,7 +588,6 @@ sub handle_service_exitcode {
$tries->{$sid} = 0 if !defined($tries->{$sid});
- $tries->{$sid}++;
if ($tries->{$sid} >= $max_restart) {
$haenv->log('err', "unable to start service $sid on local node".
" after $tries->{$sid} retries");
@@ -596,6 +595,10 @@ sub handle_service_exitcode {
return ERROR;
}
+ $tries->{$sid}++;
+
+ $haenv->log('warning', "restart policy: retry number $tries->{$sid}" .
+ " for service '$sid'");
# tell CRM that we retry the start
return ETRY_AGAIN;
}
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 48826e7..21a34dd 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -556,14 +556,25 @@ sub next_state_started {
my $try_next = 0;
if ($lrm_res) {
- if ($lrm_res->{exit_code} == ERROR) {
+ my $ec = $lrm_res->{exit_code};
+ if ($ec == SUCCESS) {
+
+ $master_status->{relocate_trial}->{$sid} = 0;
+
+ } elsif ($ec == ETRY_AGAIN) {
+
+ # do nothing, the LRM wants to try again
+
+ } elsif ($ec == ERROR) {
+ # apply our relocate policy if we got ERROR from the LRM
my $try = $master_status->{relocate_trial}->{$sid} || 0;
if ($try < $cd->{max_relocate}) {
$try++;
- $try_next = 1; # tell select_service_node to relocate
+ # tell select_service_node to relocate if possible
+ $try_next = 1;
$haenv->log('warning', "starting service $sid on node".
" '$sd->{node}' failed, relocating service.");
@@ -577,8 +588,11 @@ sub next_state_started {
return;
}
- } elsif ($lrm_res->{exit_code} == SUCCESS) {
- $master_status->{relocate_trial}->{$sid} = 0;
+ } else {
+ $haenv->log('err', "service '$sid' got unrecoverable error" .
+ " (exit code $ec))");
+ # we have no safe way out (yet) for other errors
+ &$change_service_state($self, $sid, 'error');
}
}
--
2.1.4
More information about the pve-devel
mailing list