[pve-devel] [RFC ha-manager v2 5/7] allow LRM lock stealing for fenced nodes

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Mar 11 16:57:14 CET 2016


We are only allowed to recover (=steal) a service when we hold its
LRM's lock. This guarantees that even if said LRM comes up again
during the steal operation, it cannot start the services during the
short time in which the service config still belongs to it.

This is important, as otherwise we have a possible race for the
resource, which can result in a service being started both on the old
(restarted) node and on the node the service was recovered to, which
is really bad!
---

The Sim::Env sim_get_lock now allows the master to steal an arbitrary
lock.

 src/PVE/HA/Env.pm      |  4 ++--
 src/PVE/HA/Env/PVE2.pm |  4 ++--
 src/PVE/HA/Sim/Env.pm  | 16 ++++++++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index 83774a6..6c3a08e 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -153,9 +153,9 @@ sub get_ha_agent_lock {
 # this should only get called if the nodes LRM gracefully shuts down with
 # all services already cleanly stopped!
 sub release_ha_agent_lock {
-    my ($self) = @_;
+    my ($self, $node) = @_;
 
-    return $self->{plug}->release_ha_agent_lock();
+    return $self->{plug}->release_ha_agent_lock($node);
 }
 
 # return true when cluster is quorate
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 6dd6aa0..bbc9e3d 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -315,9 +315,9 @@ sub get_ha_agent_lock {
 # this should only get called if the nodes LRM gracefully shuts down with
 # all services already cleanly stopped!
 sub release_ha_agent_lock {
-    my ($self) = @_;
+    my ($self, $node) = @_;
 
-    my $node = $self->nodename();
+    $node = $node || $self->nodename();
 
     return rmdir("$lockdir/ha_agent_${node}_lock");
 }
diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index 2c4b0bc..b26c156 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -75,13 +75,17 @@ sub sim_get_lock {
 	    if (my $d = $data->{$lock_name}) {
 		my $tdiff = $ctime - $d->{time};
 
+		my $manager_node = $data->{'ha_manager_lock'}->{node} || '';
+
+		$res = 0;
 		if ($tdiff > $self->{lock_timeout}) {
 		    $res = 1;
-		} elsif (($tdiff <= $self->{lock_timeout}) && ($d->{node} eq $nodename)) {
-		    delete $data->{$lock_name};
-		    $res = 1;
 		} else {
-		    $res = 0;
+		    # if we aren't manager we may unlock only *our* lock
+		    if ($d->{node} eq $nodename || $manager_node eq $nodename) {
+			delete $data->{$lock_name};
+			$res = 1;
+		    }
 		}
 	    }
 
@@ -271,9 +275,9 @@ sub get_ha_agent_lock {
 # this should only get called if the nodes LRM gracefully shuts down with
 # all services already cleanly stopped!
 sub release_ha_agent_lock {
-    my ($self) = @_;
+    my ($self, $node) = @_;
 
-    my $node = $self->nodename();
+    $node = $node || $self->nodename();
 
     my $lock = $self->get_ha_agent_lock_name($node);
     return $self->sim_get_lock($lock, 1);
-- 
2.1.4





More information about the pve-devel mailing list