[pve-devel] [PATCH ha-manager] send email on fence failure and success

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Apr 8 17:14:23 CEST 2016


Send an email when starting to fence a node and on either success or
failure.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---

Same as the other email: something went missing when I rebased (I checked the
history!), namely the sendmail call on fence start; the regression tests had
the respective change already. Strange, but it is late Friday afternoon, so...

 src/PVE/HA/NodeStatus.pm           | 45 +++++++++++++++++++++++++++++++++++---
 src/test/test-basic1/log.expect    |  2 ++
 src/test/test-basic2/log.expect    |  1 +
 src/test/test-basic5/log.expect    |  2 ++
 src/test/test-hw-fence1/log.expect |  2 ++
 src/test/test-shutdown1/log.expect |  2 ++
 src/test/test-shutdown2/log.expect |  2 ++
 src/test/test-shutdown3/log.expect |  2 ++
 src/test/test-shutdown4/log.expect |  2 ++
 9 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/src/PVE/HA/NodeStatus.pm b/src/PVE/HA/NodeStatus.pm
index 6b29f9c..0a410ba 100644
--- a/src/PVE/HA/NodeStatus.pm
+++ b/src/PVE/HA/NodeStatus.pm
@@ -2,6 +2,9 @@ package PVE::HA::NodeStatus;
 
 use strict;
 use warnings;
+
+use JSON;
+
 use PVE::HA::Fence;
 
 use Data::Dumper;
@@ -166,6 +169,37 @@ sub update {
    }
 }
 
+# assembles a common text for fence emails (fence status plus manager and node status as JSON) and sends it via $haenv->sendmail
+my $send_fence_state_email = sub {
+    my ($self, $subject_prefix, $subject, $node) = @_;
+
+    my $haenv = $self->{haenv};
+
+    my $mail_text = <<EOF
+The node '$node' failed and needs manual intervention.
+
+The PVE HA manager tries  to fence it and recover the
+configured HA resources to a healthy node if possible.
+
+Current fence status:  $subject_prefix
+$subject
+
+
+Overall Cluster status:
+-----------------------
+
+EOF
+;
+    my $mail_subject = $subject_prefix . ': ' . $subject;
+
+    my $status = $haenv->read_manager_status();
+    my $data = { manager_status => $status, node_status => $self->{status} };
+
+    $mail_text .= to_json($data, { pretty => 1, canonical => 1});
+
+    $haenv->sendmail($mail_subject, $mail_text);
+};
+
 # start fencing
 sub fence_node {
     my ($self, $node) = @_;
@@ -176,6 +210,8 @@ sub fence_node {
 
     if ($state ne 'fence') {
 	&$set_node_state($self, $node, 'fence');
+	my $msg = "Try to fence node '$node'";
+	&$send_fence_state_email($self, 'FENCE', $msg, $node);
     }
 
     my ($success, $hw_fence_success) = (0, 0);
@@ -188,8 +224,9 @@ sub fence_node {
 
 	# bad fence.cfg or no devices and only hardware fencing configured
 	if ($hw_fence_success < 0 && $fencing_mode eq 'hardware') {
-	    $haenv->log('err', "Fencing of node '$node' failed and needs " .
-			"manual intervention!");
+	    my $msg = "Fencing of node '$node' failed and needs manual intervention!";
+	    $haenv->log('err', $msg);
+	    &$send_fence_state_email($self, 'FAILED', $msg, $node);
 	    return 0;
 	}
 
@@ -216,8 +253,10 @@ sub fence_node {
     }
 
     if ($success) {
-	$haenv->log("info", "fencing: acknowleged - got agent lock for node '$node'");
+	my $msg = "fencing: acknowleged - got agent lock for node '$node'";
+	$haenv->log("info", $msg);
 	&$set_node_state($self, $node, 'unknown');
+	&$send_fence_state_email($self, 'SUCEED', $msg, $node);
 	PVE::HA::Fence::kill_and_cleanup_jobs($node) if ($fencing_mode ne 'watchdog');
     }
 
diff --git a/src/test/test-basic1/log.expect b/src/test/test-basic1/log.expect
index 68df71b..250b918 100644
--- a/src/test/test-basic1/log.expect
+++ b/src/test/test-basic1/log.expect
@@ -36,6 +36,7 @@ info    124    node3/crm: status change slave => wait_for_quorum
 info    125    node3/lrm: status change active => lost_agent_lock
 info    160    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
 info    160    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai    160    node1/crm: FENCE: Try to fence node 'node3'
 info    166     watchdog: execute power node3 off
 info    165    node3/crm: killed by poweroff
 info    166    node3/lrm: killed by poweroff
@@ -43,6 +44,7 @@ info    166     hardware: server 'node3' stopped by poweroff (watchdog)
 info    240    node1/crm: got lock 'ha_agent_node3_lock'
 info    240    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
 info    240    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai    240    node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
 info    240    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
 info    240    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node2)
 info    243    node2/lrm: starting service vm:103
diff --git a/src/test/test-basic2/log.expect b/src/test/test-basic2/log.expect
index 72822ce..f20d09c 100644
--- a/src/test/test-basic2/log.expect
+++ b/src/test/test-basic2/log.expect
@@ -11,6 +11,7 @@ info     22    node3/crm: node 'node2': state changed from 'online' => 'unknown'
 info     22    node3/crm: got lock 'ha_agent_node1_lock'
 info     22    node3/crm: fencing: acknowleged - got agent lock for node 'node1'
 info     22    node3/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai     22    node3/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
 info     22    node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node3'
 info     22    node3/crm: service 'vm:101': state changed from 'fence' to 'started'  (node = node3)
 info     23    node3/lrm: got lock 'ha_agent_node3_lock'
diff --git a/src/test/test-basic5/log.expect b/src/test/test-basic5/log.expect
index 54b579c..8131797 100644
--- a/src/test/test-basic5/log.expect
+++ b/src/test/test-basic5/log.expect
@@ -43,9 +43,11 @@ info    222    node3/crm: status change slave => master
 info    222    node3/crm: node 'node1': state changed from 'online' => 'unknown'
 info    282    node3/crm: service 'vm:101': state changed from 'started' to 'fence'
 info    282    node3/crm: node 'node1': state changed from 'unknown' => 'fence'
+emai    282    node3/crm: FENCE: Try to fence node 'node1'
 info    282    node3/crm: got lock 'ha_agent_node1_lock'
 info    282    node3/crm: fencing: acknowleged - got agent lock for node 'node1'
 info    282    node3/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai    282    node3/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
 info    282    node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node2'
 info    282    node3/crm: service 'vm:101': state changed from 'fence' to 'started'  (node = node2)
 info    301    node2/lrm: starting service vm:101
diff --git a/src/test/test-hw-fence1/log.expect b/src/test/test-hw-fence1/log.expect
index 15555d7..cb0b7a6 100644
--- a/src/test/test-hw-fence1/log.expect
+++ b/src/test/test-hw-fence1/log.expect
@@ -36,6 +36,7 @@ info    124    node3/crm: status change slave => wait_for_quorum
 info    125    node3/lrm: status change active => lost_agent_lock
 info    160    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
 info    160    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai    160    node1/crm: FENCE: Try to fence node 'node3'
 noti    160    node1/crm: Start fencing node 'node3'
 noti    160    node1/crm: [fence 'node3'] execute cmd: fence_virt --ip=127.0.0.1 --plug=102
 info    160   fence_virt: execute power node3 off
@@ -46,6 +47,7 @@ noti    180    node1/crm: fencing of node 'node3' succeeded, trying to get its a
 info    180    node1/crm: got lock 'ha_agent_node3_lock'
 info    180    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
 info    180    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai    180    node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
 info    180    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
 info    180    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node2)
 info    183    node2/lrm: starting service vm:103
diff --git a/src/test/test-shutdown1/log.expect b/src/test/test-shutdown1/log.expect
index 9dbdb84..185e4ff 100644
--- a/src/test/test-shutdown1/log.expect
+++ b/src/test/test-shutdown1/log.expect
@@ -31,9 +31,11 @@ info    145     shutdown: execute power node3 off
 info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
 info    200    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
 info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai    200    node1/crm: FENCE: Try to fence node 'node3'
 info    200    node1/crm: got lock 'ha_agent_node3_lock'
 info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai    200    node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
 info    200    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
 info    201    node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown2/log.expect b/src/test/test-shutdown2/log.expect
index c3fbb07..719bff6 100644
--- a/src/test/test-shutdown2/log.expect
+++ b/src/test/test-shutdown2/log.expect
@@ -31,9 +31,11 @@ info    145     shutdown: execute power node3 off
 info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
 info    200    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
 info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai    200    node1/crm: FENCE: Try to fence node 'node3'
 info    200    node1/crm: got lock 'ha_agent_node3_lock'
 info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai    200    node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
 info    200    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
 info    201    node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown3/log.expect b/src/test/test-shutdown3/log.expect
index 16d8c4e..6822925 100644
--- a/src/test/test-shutdown3/log.expect
+++ b/src/test/test-shutdown3/log.expect
@@ -31,9 +31,11 @@ info    145     shutdown: execute power node3 off
 info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
 info    200    node1/crm: service 'ct:103': state changed from 'started' to 'fence'
 info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai    200    node1/crm: FENCE: Try to fence node 'node3'
 info    200    node1/crm: got lock 'ha_agent_node3_lock'
 info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai    200    node1/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node3'
 info    200    node1/crm: recover service 'ct:103' from fenced node 'node3' to node 'node1'
 info    200    node1/crm: service 'ct:103': state changed from 'fence' to 'started'  (node = node1)
 info    201    node1/lrm: got lock 'ha_agent_node1_lock'
diff --git a/src/test/test-shutdown4/log.expect b/src/test/test-shutdown4/log.expect
index 843104b..47bc700 100644
--- a/src/test/test-shutdown4/log.expect
+++ b/src/test/test-shutdown4/log.expect
@@ -34,9 +34,11 @@ info    141    node2/crm: status change slave => master
 info    141    node2/crm: node 'node1': state changed from 'online' => 'unknown'
 info    220    node2/crm: service 'vm:100': state changed from 'started' to 'fence'
 info    220    node2/crm: node 'node1': state changed from 'unknown' => 'fence'
+emai    220    node2/crm: FENCE: Try to fence node 'node1'
 info    220    node2/crm: got lock 'ha_agent_node1_lock'
 info    220    node2/crm: fencing: acknowleged - got agent lock for node 'node1'
 info    220    node2/crm: node 'node1': state changed from 'fence' => 'unknown'
+emai    220    node2/crm: SUCEED: fencing: acknowleged - got agent lock for node 'node1'
 info    220    node2/crm: recover service 'vm:100' from fenced node 'node1' to node 'node2'
 info    220    node2/crm: service 'vm:100': state changed from 'fence' to 'started'  (node = node2)
 info    221    node2/lrm: got lock 'ha_agent_node2_lock'
-- 
2.1.4





More information about the pve-devel mailing list