[pve-devel] [PATCH pve-ha-manager] limit service start tries
Thomas Lamprecht
t.lamprecht at proxmox.com
Mon Sep 7 13:52:16 CEST 2015
Previously we tried to start a service indefinitely; now we
limit it to 3 start tries. After that the service goes into the error
state and needs to be disabled manually.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Env/PVE2.pm | 2 --
src/PVE/HA/Manager.pm | 29 +++++++++++++++++++++++++----
2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index d508922..e49245a 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -392,8 +392,6 @@ sub exec_resource_agent {
if ($cmd eq 'started') {
- # fixme: count failures
-
return 0 if $running;
$self->log("info", "starting service $sid");
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 746c1da..405ba1d 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -180,11 +180,12 @@ my $change_service_state = sub {
my ($self, $sid, $new_state, %params) = @_;
my ($haenv, $ss) = ($self->{haenv}, $self->{ss});
-
+
my $sd = $ss->{$sid} || die "no such service '$sid";
my $old_state = $sd->{state};
my $old_node = $sd->{node};
+ my $old_try = $sd->{start_try};
die "no state change" if $old_state eq $new_state; # just to be sure
@@ -194,6 +195,7 @@ my $change_service_state = sub {
$sd->{state} = $new_state;
$sd->{node} = $old_node;
+ $sd->{start_try} = $old_try;
my $text_state = '';
foreach my $k (sort keys %params) {
@@ -294,7 +296,7 @@ sub manage {
next if $ss->{$sid}; # already there
$haenv->log('info', "adding new service '$sid' on node '$sc->{$sid}->{node}'");
# assume we are running to avoid relocate running service at add
- $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node},
+ $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}, start_try => 0,
uid => compute_new_uuid('started') };
}
@@ -552,8 +554,27 @@ sub next_state_started {
} else {
my $try_next = 0;
- if ($lrm_res && ($lrm_res->{exit_code} != 0)) { # fixme: other exit codes?
- $try_next = 1;
+ if(defined($lrm_res)) { # only count tries when the LRM reported a result
+ if ($lrm_res->{exit_code} == 0) {
+ $sd->{start_try} = 0; # successful start: forget earlier failures
+ } else {
+ $try_next = 1;
+
+ $sd->{start_try} = 0 if !defined($sd->{start_try}); # entry has no counter yet
+
+ my $max_start_tries = 3; # fixme: make configurable
+
+ if($sd->{start_try} >= $max_start_tries) {
+ $haenv->log('err', "service $sid couldn't be started after " .
+ "$max_start_tries tries and needs manual intervention");
+ $sd->{start_try} = 0; # reset, change_service_state carries start_try over
+ &$change_service_state($self, $sid, 'error');
+ return;
+ }
+
+ $sd->{start_try}++; # one more failed start attempt
+
+ }
}
my $node = select_service_node($self->{groups}, $self->{online_node_usage},
--
2.1.4
More information about the pve-devel
mailing list