[pve-devel] [RFC] implement recovery policy for services
Thomas Lamprecht
t.lamprecht at proxmox.com
Fri Sep 11 17:23:01 CEST 2015
RFC: variable names and some log messages may still change or get removed.
We implement recovery policies almost identical to those of rgmanager.
The following policies kick in on a failed
service start:
* restart: restarts a service on the same node; the number of restarts
is limited to a maximum value which (later) can be
configured.
This policy gets enforced by the LRM.
* relocate: migrates the service to another node and tries to
start it there, also limited to a maximum number of
migration tries.
This policy gets enforced by the CRM.
* everything: [won't be the final name, suggestions welcome]
this does both: it first tries to restart the service on
the current node and, after the maximum number of tries,
relocates the service and then tries again. This means altogether
there are MAX_RELOCATE * MAX_RESTARTS tries to get
the service running.
If any policy fails, the service state switches to 'error', which
means that the service needs to be checked and disabled manually.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Env/PVE2.pm | 3 +--
src/PVE/HA/LRM.pm | 47 +++++++++++++++++++++++++++++++++++++++++++++++
src/PVE/HA/Manager.pm | 32 ++++++++++++++++++++++++++++++--
src/PVE/HA/Resources.pm | 9 +++++++++
4 files changed, 87 insertions(+), 4 deletions(-)
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index d508922..bcf8cd5 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -97,6 +97,7 @@ sub read_service_config {
my $d = $res->{ids}->{$sid};
my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);
$d->{state} = 'enabled' if !defined($d->{state});
+ $d->{recovery} = 'restart' if !defined($d->{recovery});
if (PVE::HA::Resources->lookup($d->{type})) {
if (my $vmd = $vmlist->{ids}->{$name}) {
if (!$vmd) {
@@ -392,8 +393,6 @@ sub exec_resource_agent {
if ($cmd eq 'started') {
- # fixme: count failures
-
return 0 if $running;
$self->log("info", "starting service $sid");
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index bc8ed52..e5145d8 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -448,6 +448,8 @@ sub resource_command_finished {
$exit_code = ($status >> 8);
}
+ $exit_code = $self->handle_service_exitcode($sid, $w->{state}, $exit_code);
+
$self->{results}->{$uid} = {
sid => $w->{sid},
state => $w->{state},
@@ -472,4 +474,49 @@ sub resource_command_finished {
$self->{results} = $results;
}
+
+my $service_start_trial = {};
+
+# Apply the LRM part of the recovery policy to a finished resource command.
+#
+# $sid is the service ID, $cmd the executed command and $exit_code the exit
+# code it produced. Only a 'started' command is handled: exit code 0 resets
+# the per-service retry counter, exit code 1 is retried when the recovery
+# policy is 'restart' or 'everything'. Anything else passes through as is.
+#
+# Returns 2 while start retries are left, 1 once the retry limit is reached,
+# and the unchanged $exit_code in every other case.
+sub handle_service_exitcode {
+    my ($self, $sid, $cmd, $exit_code) = @_;
+    my $haenv = $self->{haenv};
+    my $sc = $haenv->read_service_config();
+    my $cd = $sc->{$sid};
+
+    return $exit_code if $cmd ne 'started';
+
+    if ($exit_code == 0) {
+        $service_start_trial->{$sid} = 0;
+        return $exit_code;
+    }
+
+    # the policy test is grouped on its own: '&&' binds tighter than '||', so
+    # without it 'everything' would be retried for any non-zero exit code
+    my $policy = defined($cd->{recovery}) ? $cd->{recovery} : '';
+    my $restartable = ($policy eq 'restart' || $policy eq 'everything');
+    return $exit_code if !($exit_code == 1 && $restartable);
+
+    my $failure_count = 3; # fixme: outsource to cfg
+    my $trial = ($service_start_trial->{$sid} || 0) + 1;
+
+    if ($trial >= $failure_count) {
+        $haenv->log('err', "unable to start service $sid after $failure_count tries");
+        $service_start_trial->{$sid} = 0;
+        return 1;
+    }
+
+    $haenv->log('info', "unable to start service $sid on retry: $trial");
+    $service_start_trial->{$sid} = $trial;
+    return 2;
+}
+
1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 746c1da..ad3fb43 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -513,6 +513,8 @@ sub next_state_stopped {
$haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
+my $service_migrate_trial = {};
+
sub next_state_started {
my ($self, $sid, $cd, $sd, $lrm_res) = @_;
@@ -552,8 +554,34 @@ sub next_state_started {
} else {
my $try_next = 0;
- if ($lrm_res && ($lrm_res->{exit_code} != 0)) { # fixme: other exit codes?
- $try_next = 1;
+ if ($lrm_res) {
+ if ($lrm_res->{exit_code} == 1) {
+ if($cd->{recovery} eq 'relocate' ||
+ $cd->{recovery} eq 'everything') {
+
+ my $max_tries = 3; # fixme: outsource to cfg
+
+ my $try = $service_migrate_trial->{$sid} || 0;
+
+ $try++;
+ if ($try >= $max_tries) {
+ $service_migrate_trial->{$sid} = 0;
+ $haenv->log('err', "relocate recovery policy for".
+ "service $sid failed");
+ &$change_service_state($self, $sid, 'error');
+ return;
+ }
+
+ $try_next = 1;
+ $haenv->log('info', "start failed, relocating service ".
+ "$sid (try $service_migrate_trial->{$sid})");
+ $service_migrate_trial->{$sid} = $try;
+ } else {
+ &$change_service_state($self, $sid, 'error');
+ }
+ } elsif($lrm_res->{exit_code} == 0) {
+ $service_migrate_trial->{$sid} = 0;
+ }
}
my $node = select_service_node($self->{groups}, $self->{online_node_usage},
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 2bdebb9..9065b22 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -21,6 +21,13 @@ my $defaultData = {
optional => 1,
default => 'enabled',
},
+ recovery => {
+ description => "Recovery policy.",
+ type => 'string',
+ enum => ['restart', 'relocate', 'disable', 'everything'],
+ optional => 1,
+ default => 'restart',
+ },
group => get_standard_option('pve-ha-group-id', { optional => 1 }),
comment => {
description => "Description.",
@@ -116,6 +123,7 @@ sub options {
state => { optional => 1 },
group => { optional => 1 },
comment => { optional => 1 },
+ recovery => { optional => 1 }
};
}
@@ -180,6 +188,7 @@ sub options {
state => { optional => 1 },
group => { optional => 1 },
comment => { optional => 1 },
+ recovery => { optional => 1 }
};
}
--
2.1.4
More information about the pve-devel
mailing list