[pve-devel] [RFC ha-manager 3/3] always fence nodes on dead LRM

Thomas Lamprecht t.lamprecht at proxmox.com
Tue Apr 26 10:55:29 CEST 2016


fixes a recovery failure if a node starts up with a dead/broken LRM
but working corosync.

So while it's quorate it doesn't do anything, but the CRM won't fence
it, as our "last_online" timestamp only checks whether the node is
quorate, not whether the HA manager is actually working.

This can be reproduced on an active node with services; simply
disable the LRM:
$ systemctl disable pve-ha-lrm
and then reboot.
(this would simulate a broken update/reboot)
The node comes up again and gains quorum, but the LRM does not
start and thus no service gets started/migrated/... Fencing is
appropriate in such a situation.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---

Note: the changes in the regression tests are the result of a dead LRM
which comes online again. With those changes the LRM needs to update
its status timestamp first, before the CRM marks it as online; this
ensures that the node is fully online and working, and not "just"
that corosync runs.

 src/PVE/HA/Manager.pm              | 14 +++++++++-----
 src/PVE/HA/NodeStatus.pm           | 24 ++++++++++++++++++------
 src/test/test-shutdown2/log.expect | 20 ++++++++++----------
 src/test/test-shutdown3/log.expect | 24 ++++++++++++------------
 4 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 54e99b5..e621584 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -258,9 +258,11 @@ sub read_lrm_status {
 
     my $results = {};
     my $modes = {};
+    my $timestamps = {};
     foreach my $node (@$nodes) {
 	my $lrm_status = $haenv->read_lrm_status($node);
 	$modes->{$node} = $lrm_status->{mode} || 'active';
+	$timestamps->{$node} = $lrm_status->{timestamp} || $haenv->get_time();
 	foreach my $uid (keys %{$lrm_status->{results}}) {
 	    next if $results->{$uid}; # should not happen
 	    $results->{$uid} = $lrm_status->{results}->{$uid};
@@ -268,7 +270,7 @@ sub read_lrm_status {
     }
 
     
-    return ($results, $modes);
+    return ($results, $modes, $timestamps);
 }
 
 # read new crm commands and save them into crm master status
@@ -311,15 +313,17 @@ sub manage {
 
     my ($haenv, $ms, $ns, $ss) = ($self->{haenv}, $self->{ms}, $self->{ns}, $self->{ss});
 
-    $ns->update($haenv->get_node_info());
+    my ($lrm_results, $lrm_modes, $timestamps) = $self->read_lrm_status();
 
-    if (!$ns->node_is_online($haenv->nodename())) {
+    my ($node_info, $quorate) = $haenv->get_node_info();
+
+    $ns->update($node_info, $timestamps);
+
+    if (!($ns->node_is_online($haenv->nodename()) || $quorate)) {
 	$haenv->log('info', "master seems offline");
 	return;
     }
 
-    my ($lrm_results, $lrm_modes) = $self->read_lrm_status();
-
     my $sc = $haenv->read_service_config();
 
     $self->{groups} = $haenv->read_group_config(); # update
diff --git a/src/PVE/HA/NodeStatus.pm b/src/PVE/HA/NodeStatus.pm
index d9ef912..3a54d3c 100644
--- a/src/PVE/HA/NodeStatus.pm
+++ b/src/PVE/HA/NodeStatus.pm
@@ -117,7 +117,7 @@ my $set_node_state = sub {
 };
 
 sub update {
-    my ($self, $node_info) = @_;
+    my ($self, $node_info, $timestamps) = @_;
 
     my $haenv = $self->{haenv};
 
@@ -125,20 +125,32 @@ sub update {
 	my $d = $node_info->{$node};
 	next if !$d->{online};
 
-	# record last time the node was online (required to implement fence delay)
-	$self->{last_online}->{$node} = $haenv->get_time();
-
 	my $state = $self->get_node_state($node);
 
+	my $ctime = $haenv->get_time();
+
 	if ($state eq 'online') {
-	    # &$set_node_state($self, $node, 'online');
+	    # if a node is quorate but its LRM is dead mark it as 'unknown'
+	    # to allow a possible needed service recovery
+	    if (defined($timestamps->{$node}) && ($ctime - $timestamps->{$node}) >= $fence_delay) {
+		&$set_node_state($self, $node, 'unknown');
+		next;
+	    }
 	} elsif ($state eq 'unknown' || $state eq 'gone') {
-	    &$set_node_state($self, $node, 'online');
+	    # mark new nodes or quorate with active LRM nodes as online
+	    if (!defined($timestamps->{$node}) || ($ctime - $timestamps->{$node}) < $fence_delay) {
+		&$set_node_state($self, $node, 'online');
+	    } else {
+		next;
+	    }
 	} elsif ($state eq 'fence') {
 	    # do nothing, wait until fenced
 	} else {
 	    die "detected unknown node state '$state";
 	}
+
+	# record last time the node was online (required to implement fence delay)
+	$self->{last_online}->{$node} = $ctime;
     }
 
     foreach my $node (keys %{$self->{status}}) {
diff --git a/src/test/test-shutdown2/log.expect b/src/test/test-shutdown2/log.expect
index c3fbb07..5a9ad0f 100644
--- a/src/test/test-shutdown2/log.expect
+++ b/src/test/test-shutdown2/log.expect
@@ -43,15 +43,15 @@ info    201    node1/lrm: service status vm:103 started
 info    500      cmdlist: execute power node3 on
 info    500    node3/crm: status change startup => wait_for_quorum
 info    500    node3/lrm: status change startup => wait_for_agent_lock
-info    500    node1/crm: node 'node3': state changed from 'unknown' => 'online'
-info    500    node1/crm: migrate service 'vm:103' to node 'node3' (running)
-info    500    node1/crm: service 'vm:103': state changed from 'started' to 'migrate'  (node = node1, target = node3)
-info    501    node1/lrm: service vm:103 - start migrate to node 'node3'
-info    501    node1/lrm: service vm:103 - end migrate to node 'node3'
 info    504    node3/crm: status change wait_for_quorum => slave
-info    520    node1/crm: service 'vm:103': state changed from 'migrate' to 'started'  (node = node3)
-info    525    node3/lrm: got lock 'ha_agent_node3_lock'
-info    525    node3/lrm: status change wait_for_agent_lock => active
-info    525    node3/lrm: starting service vm:103
-info    525    node3/lrm: service status vm:103 started
+info    520    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info    520    node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info    520    node1/crm: service 'vm:103': state changed from 'started' to 'migrate'  (node = node1, target = node3)
+info    521    node1/lrm: service vm:103 - start migrate to node 'node3'
+info    521    node1/lrm: service vm:103 - end migrate to node 'node3'
+info    540    node1/crm: service 'vm:103': state changed from 'migrate' to 'started'  (node = node3)
+info    545    node3/lrm: got lock 'ha_agent_node3_lock'
+info    545    node3/lrm: status change wait_for_agent_lock => active
+info    545    node3/lrm: starting service vm:103
+info    545    node3/lrm: service status vm:103 started
 info   1100     hardware: exit simulation - done
diff --git a/src/test/test-shutdown3/log.expect b/src/test/test-shutdown3/log.expect
index 16d8c4e..86ba22d 100644
--- a/src/test/test-shutdown3/log.expect
+++ b/src/test/test-shutdown3/log.expect
@@ -43,17 +43,17 @@ info    201    node1/lrm: service status ct:103 started
 info    500      cmdlist: execute power node3 on
 info    500    node3/crm: status change startup => wait_for_quorum
 info    500    node3/lrm: status change startup => wait_for_agent_lock
-info    500    node1/crm: node 'node3': state changed from 'unknown' => 'online'
-info    500    node1/crm: relocate service 'ct:103' to node 'node3'
-info    500    node1/crm: service 'ct:103': state changed from 'started' to 'relocate'  (node = node1, target = node3)
-info    501    node1/lrm: service ct:103 - start relocate to node 'node3'
-info    501    node1/lrm: stopping service ct:103 (relocate)
-info    501    node1/lrm: service status ct:103 stopped
-info    501    node1/lrm: service ct:103 - end relocate to node 'node3'
 info    504    node3/crm: status change wait_for_quorum => slave
-info    520    node1/crm: service 'ct:103': state changed from 'relocate' to 'started'  (node = node3)
-info    525    node3/lrm: got lock 'ha_agent_node3_lock'
-info    525    node3/lrm: status change wait_for_agent_lock => active
-info    525    node3/lrm: starting service ct:103
-info    525    node3/lrm: service status ct:103 started
+info    520    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info    520    node1/crm: relocate service 'ct:103' to node 'node3'
+info    520    node1/crm: service 'ct:103': state changed from 'started' to 'relocate'  (node = node1, target = node3)
+info    521    node1/lrm: service ct:103 - start relocate to node 'node3'
+info    521    node1/lrm: stopping service ct:103 (relocate)
+info    521    node1/lrm: service status ct:103 stopped
+info    521    node1/lrm: service ct:103 - end relocate to node 'node3'
+info    540    node1/crm: service 'ct:103': state changed from 'relocate' to 'started'  (node = node3)
+info    545    node3/lrm: got lock 'ha_agent_node3_lock'
+info    545    node3/lrm: status change wait_for_agent_lock => active
+info    545    node3/lrm: starting service ct:103
+info    545    node3/lrm: service status ct:103 started
 info   1100     hardware: exit simulation - done
-- 
2.1.4





More information about the pve-devel mailing list