[pve-devel] [PATCH ha-manager 4/6] send email on fence failure and success
Thomas Lamprecht
t.lamprecht at proxmox.com
Wed Jun 1 14:56:46 CEST 2016
Fencing is something which should not happen often in the real world
and most of the time has a really bad cause. Thus, send an email to
root at localhost when starting to fence a node and again on success,
to inform the cluster admin of said failures so that they can check the
hardware and cluster status as soon as possible.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/NodeStatus.pm | 39 +++++++++++++++++++++++++++++++++++++-
src/test/test-basic1/log.expect | 2 ++
src/test/test-basic2/log.expect | 1 +
src/test/test-basic5/log.expect | 2 ++
src/test/test-shutdown1/log.expect | 2 ++
src/test/test-shutdown2/log.expect | 2 ++
src/test/test-shutdown3/log.expect | 2 ++
src/test/test-shutdown4/log.expect | 2 ++
8 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/src/PVE/HA/NodeStatus.pm b/src/PVE/HA/NodeStatus.pm
index d9ef912..632dbd4 100644
--- a/src/PVE/HA/NodeStatus.pm
+++ b/src/PVE/HA/NodeStatus.pm
@@ -3,6 +3,7 @@ package PVE::HA::NodeStatus;
use strict;
use warnings;
+use JSON;
use Data::Dumper;
my $fence_delay = 60;
@@ -169,6 +170,38 @@ sub update {
}
}
+# assembles a common text for fence emails
+my $send_fence_state_email = sub {
+ my ($self, $subject_prefix, $subject, $node) = @_;
+
+ my $haenv = $self->{haenv};
+
+ my $mail_text = <<EOF
+The node '$node' failed and needs manual intervention.
+
+The PVE HA manager tries to fence it and recover the
+configured HA resources to a healthy node if possible.
+
+Current fence status: $subject_prefix
+$subject
+
+
+Overall Cluster status:
+-----------------------
+
+EOF
+;
+ my $mail_subject = $subject_prefix . ': ' . $subject;
+
+ my $status = $haenv->read_manager_status();
+ my $data = { manager_status => $status, node_status => $self->{status} };
+
+ $mail_text .= to_json($data, { pretty => 1, canonical => 1});
+
+ $haenv->sendmail($mail_subject, $mail_text);
+};
+
+
# start fencing
sub fence_node {
my ($self, $node) = @_;
@@ -179,13 +212,17 @@ sub fence_node {
if ($state ne 'fence') {
&$set_node_state($self, $node, 'fence');
+ my $msg = "Try to fence node '$node'";
+ &$send_fence_state_email($self, 'FENCE', $msg, $node);
}
my $success = $haenv->get_ha_agent_lock($node);
if ($success) {
- $haenv->log("info", "fencing: acknowleged - got agent lock for node '$node'");
+ my $msg = "fencing: acknowleged - got agent lock for node '$node'";
+ $haenv->log("info", $msg);
&$set_node_state($self, $node, 'unknown');
+ &$send_fence_state_email($self, 'SUCEED', $msg, $node);
}
return $success;
diff --git a/src/test/test-basic1/log.expect b/src/test/test-basic1/log.expect
index 68df71b..250b918 100644
--- a/src/test/test-basic1/log.expect
+++ b/src/test/test-basic1/log.expect
@@ -36,6 +36,7 @@ info 124 node3/crm: status change slave => wait_for_quorum
info 125 node3/lrm: status change active => lost_agent_lock
info 160 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
info 160 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai 160 node1/crm: FENCE: Try to fence node 'node3'
info 166 watchdog: execute power node3 off
info 165 node3/crm: killed by poweroff
info 166 node3/lrm: killed by poweroff
@@ -43,6 +44,7 @@ info 166 hardware: server 'node3' stopped by poweroff (watchdog)
info 240 node1/crm: got lock 'ha_agent_node3_lock'
info 240 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 240 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai 240 node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
info 240 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node2)
info 243 node2/lrm: starting service vm:103
diff --git a/src/test/test-basic2/log.expect b/src/test/test-basic2/log.expect
index 72822ce..f20d09c 100644
--- a/src/test/test-basic2/log.expect
+++ b/src/test/test-basic2/log.expect
@@ -11,6 +11,7 @@ info 22 node3/crm: node 'node2': state changed from 'online' => 'unknown'
info 22 node3/crm: got lock 'ha_agent_node1_lock'
info 22 node3/crm: fencing: acknowleged - got agent lock for node 'node1'
info 22 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai 22 node3/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
info 22 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node3'
info 22 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node3)
info 23 node3/lrm: got lock 'ha_agent_node3_lock'
diff --git a/src/test/test-basic5/log.expect b/src/test/test-basic5/log.expect
index 54b579c..8131797 100644
--- a/src/test/test-basic5/log.expect
+++ b/src/test/test-basic5/log.expect
@@ -43,9 +43,11 @@ info 222 node3/crm: status change slave => master
info 222 node3/crm: node 'node1': state changed from 'online' => 'unknown'
info 282 node3/crm: service 'vm:101': state changed from 'started' to 'fence'
info 282 node3/crm: node 'node1': state changed from 'unknown' => 'fence'
+emai 282 node3/crm: FENCE: Try to fence node 'node1'
info 282 node3/crm: got lock 'ha_agent_node1_lock'
info 282 node3/crm: fencing: acknowleged - got agent lock for node 'node1'
info 282 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai 282 node3/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
info 282 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node2'
info 282 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node2)
info 301 node2/lrm: starting service vm:101
diff --git a/src/test/test-shutdown1/log.expect b/src/test/test-shutdown1/log.expect
index 9dbdb84..185e4ff 100644
--- a/src/test/test-shutdown1/log.expect
+++ b/src/test/test-shutdown1/log.expect
@@ -31,9 +31,11 @@ info 145 shutdown: execute power node3 off
info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
info 200 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai 200 node1/crm: FENCE: Try to fence node 'node3'
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai 200 node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown2/log.expect b/src/test/test-shutdown2/log.expect
index c3fbb07..719bff6 100644
--- a/src/test/test-shutdown2/log.expect
+++ b/src/test/test-shutdown2/log.expect
@@ -31,9 +31,11 @@ info 145 shutdown: execute power node3 off
info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
info 200 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai 200 node1/crm: FENCE: Try to fence node 'node3'
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai 200 node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown3/log.expect b/src/test/test-shutdown3/log.expect
index 16d8c4e..6822925 100644
--- a/src/test/test-shutdown3/log.expect
+++ b/src/test/test-shutdown3/log.expect
@@ -31,9 +31,11 @@ info 145 shutdown: execute power node3 off
info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
info 200 node1/crm: service 'ct:103': state changed from 'started' to 'fence'
info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai 200 node1/crm: FENCE: Try to fence node 'node3'
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai 200 node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: recover service 'ct:103' from fenced node 'node3' to node 'node1'
info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown4/log.expect b/src/test/test-shutdown4/log.expect
index 843104b..47bc700 100644
--- a/src/test/test-shutdown4/log.expect
+++ b/src/test/test-shutdown4/log.expect
@@ -34,9 +34,11 @@ info 141 node2/crm: status change slave => master
info 141 node2/crm: node 'node1': state changed from 'online' => 'unknown'
info 220 node2/crm: service 'vm:100': state changed from 'started' to 'fence'
info 220 node2/crm: node 'node1': state changed from 'unknown' => 'fence'
+emai 220 node2/crm: FENCE: Try to fence node 'node1'
info 220 node2/crm: got lock 'ha_agent_node1_lock'
info 220 node2/crm: fencing: acknowleged - got agent lock for node 'node1'
info 220 node2/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai 220 node2/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
info 220 node2/crm: recover service 'vm:100' from fenced node 'node1' to node 'node2'
info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'started' (node = node2)
info 221 node2/lrm: got lock 'ha_agent_node2_lock'
--
2.1.4
More information about the pve-devel
mailing list