[pve-devel] [PATCH pve-ha-manager 1/4] implement recovery policy for services
Thomas Lamprecht
t.lamprecht at proxmox.com
Wed Sep 16 11:25:15 CEST 2015
We implement recovery policies which use settings known from
rgmanager; however, the behaviour is not strictly the same, as
our approach is more configurable. For example, rgmanager cannot
combine its restart and relocate policy.
The following policy settings kick in on a failed
service start:
* max_restart: maximal number of tries to restart a failed service
on the current node. The default is 1 restart try.
This policy gets enforced by the LRM.
* max_relocate: maximal number of tries to relocate the service to
a different node. A relocate only takes place after
the max_restart value is exceeded on the current node.
This policy gets enforced by the CRM.
If a service is still not running after all max tries, its state
gets set to 'error'. This means that the service needs to be checked
and disabled manually.
*Note* that the relocate state will only reset when the service had
at least one successful start. That means if a service is reenabled
without fixing the error only the restart policy gets repeated.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Env/PVE2.pm | 4 ++--
src/PVE/HA/LRM.pm | 43 +++++++++++++++++++++++++++++++++++++++++++
src/PVE/HA/Manager.pm | 28 ++++++++++++++++++++++++++--
src/PVE/HA/Resources.pm | 20 ++++++++++++++++++++
4 files changed, 91 insertions(+), 4 deletions(-)
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index d508922..5d126b0 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -97,6 +97,8 @@ sub read_service_config {
my $d = $res->{ids}->{$sid};
my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);
$d->{state} = 'enabled' if !defined($d->{state});
+ $d->{max_restart} = 1 if !defined($d->{max_restart});
+ $d->{max_relocate} = 1 if !defined($d->{max_relocate});
if (PVE::HA::Resources->lookup($d->{type})) {
if (my $vmd = $vmlist->{ids}->{$name}) {
if (!$vmd) {
@@ -392,8 +394,6 @@ sub exec_resource_agent {
if ($cmd eq 'started') {
- # fixme: count failures
-
return 0 if $running;
$self->log("info", "starting service $sid");
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index bc8ed52..ed2885f 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -29,6 +29,7 @@ sub new {
status => { state => 'startup' },
workers => {},
results => {},
+ restart_tries => {},
shutdown_request => 0,
# mode can be: active, reboot, shutdown, restart
mode => 'active',
@@ -448,6 +449,8 @@ sub resource_command_finished {
$exit_code = ($status >> 8);
}
+ $exit_code = $self->handle_service_exitcode($sid, $w->{state}, $exit_code);
+
$self->{results}->{$uid} = {
sid => $w->{sid},
state => $w->{state},
@@ -472,4 +475,44 @@ sub resource_command_finished {
$self->{results} = $results;
}
+# Processes the exit code of a finished resource agent command, so that the
+# CRM knows if the LRM wants to retry an action based on the current recovery
+# policy (max_restart) of the failed service, or if the CRM itself must try
+# to recover from the failure.
+#
+# Parameters:
+#   $sid       - ID of the service the command was executed for
+#   $cmd       - the executed command; only 'started' is subject to the policy
+#   $exit_code - exit code returned by the resource agent
+#
+# Returns:
+#   0 - the start succeeded; the restart counter of the service is reset
+#   1 - the start failed and no restart tries are left on this node
+#   2 - the start failed and the LRM wants to retry it on this node
+# Every other exit code, and every exit code of a command other than
+# 'started', is passed through unchanged.
+sub handle_service_exitcode {
+    my ($self, $sid, $cmd, $exit_code) = @_;
+
+    my $haenv = $self->{haenv};
+    my $tries = $self->{restart_tries};
+
+    # the restart policy only applies to service starts
+    return $exit_code if $cmd ne 'started';
+
+    if ($exit_code == 0) {
+        # successful start, forget all earlier failed tries
+        $tries->{$sid} = 0;
+        return $exit_code;
+    }
+
+    # only a plain failure (exit code 1) is handled by the restart policy
+    return $exit_code if $exit_code != 1;
+
+    # a service which vanished from the configuration gets no restart tries
+    my $sc = $haenv->read_service_config();
+    my $cd = $sc->{$sid};
+    my $max_restart = ($cd && $cd->{max_restart}) || 0;
+
+    $tries->{$sid} = 0 if !defined($tries->{$sid});
+
+    # compare *before* counting the upcoming retry: $tries holds the number
+    # of restarts already done, so max_restart = 1 really allows one restart
+    # and max_restart = 0 allows none
+    if ($tries->{$sid} >= $max_restart) {
+        $haenv->log('err', "unable to start service $sid on local node".
+                    " after $tries->{$sid} retries");
+        $tries->{$sid} = 0;
+        return 1;
+    }
+
+    $tries->{$sid}++;
+
+    return 2;
+}
+
+
1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 746c1da..fe15872 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -517,6 +517,7 @@ sub next_state_started {
my ($self, $sid, $cd, $sd, $lrm_res) = @_;
my $haenv = $self->{haenv};
+ my $master_status = $self->{ms};
my $ns = $self->{ns};
if (!$ns->node_is_online($sd->{node})) {
@@ -552,8 +553,31 @@ sub next_state_started {
} else {
my $try_next = 0;
- if ($lrm_res && ($lrm_res->{exit_code} != 0)) { # fixme: other exit codes?
- $try_next = 1;
+ if ($lrm_res) {
+ if ($lrm_res->{exit_code} == 1) {
+
+ my $try = $master_status->{relocate_trial}->{$sid} || 0;
+
+ if($try < $cd->{max_relocate}) {
+
+ $try++;
+ $try_next = 1; # tell select_service_node to relocate
+
+ $haenv->log('warning', "starting service $sid on node".
+ " '$sd->{node}' failed, relocating service.");
+ $master_status->{relocate_trial}->{$sid} = $try;
+
+ } else {
+
+ $haenv->log('err', "recovery policy for service".
+ " $sid failed, entering error state!");
+ &$change_service_state($self, $sid, 'error');
+ return;
+
+ }
+ } elsif($lrm_res->{exit_code} == 0) {
+ $master_status->{relocate_trial}->{$sid} = 0;
+ }
}
my $node = select_service_node($self->{groups}, $self->{online_node_usage},
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 2543586..c41fa91 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -25,6 +25,22 @@ my $defaultData = {
group => get_standard_option('pve-ha-group-id',
{ optional => 1,
completion => \&PVE::HA::Tools::complete_group }),
+ max_restart => {
+ description => "Maximal number of tries to restart the service on".
+ " a node after its start failed.",
+ type => 'integer',
+ optional => 1,
+ default => 1,
+ minimum => 0,
+ },
+ max_relocate => {
+ description => "Maximal number of service relocate tries when a".
+ " service failes to start.",
+ type => 'integer',
+ optional => 1,
+ default => 1,
+ minimum => 0,
+ },
comment => {
description => "Description.",
type => 'string',
@@ -119,6 +135,8 @@ sub options {
state => { optional => 1 },
group => { optional => 1 },
comment => { optional => 1 },
+ max_restart => { optional => 1 },
+ max_relocate => { optional => 1 },
};
}
@@ -183,6 +201,8 @@ sub options {
state => { optional => 1 },
group => { optional => 1 },
comment => { optional => 1 },
+ max_restart => { optional => 1 },
+ max_relocate => { optional => 1 },
};
}
--
2.1.4
More information about the pve-devel
mailing list