[pve-devel] [PATCH ha-manager v3 5/6] add check if a service is relocatable and assert it on recovery

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Sep 14 11:29:44 CEST 2016


Add a basic check if a service is relocatable, i.e. is not bound to
local resources of a node.

Assert this check when a service gets recovered from a failed node,
to avoid moving a service to another node only to have it fail to
start there, when we already knew beforehand that it could not start.

Place such services in the error state as they need manual
intervention.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Manager.pm                    |  7 ++++++
 src/PVE/HA/Resources.pm                  |  7 ++++++
 src/PVE/HA/Resources/PVECT.pm            | 17 ++++++++++++++
 src/PVE/HA/Resources/PVEVM.pm            | 20 ++++++++++++++++
 src/PVE/HA/Sim/Resources.pm              | 17 ++++++++++++++
 src/PVE/HA/Tools.pm                      | 16 +++++++++++++
 src/test/test-locked-service2/log.expect | 39 ++++++++++++++++++++++++++++++++
 7 files changed, 123 insertions(+)
 create mode 100644 src/test/test-locked-service2/log.expect

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index c60df7c..2d1013a 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -261,6 +261,13 @@ my $recover_fenced_service = sub {
 					    $cd, $sd->{node});
 
     if ($recovery_node) {
+	eval { PVE::HA::Tools::check_service_is_relocatable($sid, $haenv, $fenced_node, 1); };
+	if (my $err = $@) {
+	    $haenv->log('err', "service '$sid' not recoverable: $err");
+	    &$change_service_state($self, $sid, 'error');
+	    return;
+	}
+
 	$haenv->log('info', "recover service '$sid' from fenced node " .
 		    "'$fenced_node' to node '$recovery_node'");
 
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 3836fc8..fea65c7 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -124,6 +124,13 @@ sub check_running {
     die "implement in subclass";
 }
 
+sub check_service_is_relocatable {
+    my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+    die "implement in subclass";
+}
+
+
 
 # package PVE::HA::Resources::IPAddr;
 
diff --git a/src/PVE/HA/Resources/PVECT.pm b/src/PVE/HA/Resources/PVECT.pm
index b6ebe2f..fa0237b 100644
--- a/src/PVE/HA/Resources/PVECT.pm
+++ b/src/PVE/HA/Resources/PVECT.pm
@@ -114,4 +114,21 @@ sub check_running {
     return PVE::LXC::check_running($vmid);
 }
 
+sub check_service_is_relocatable {
+    my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+    my $conf = PVE::LXC::Config->load_config($id, $service_node);
+
+    # check for blocking locks, when doing recovery allow safe-to-delete locks
+    my $lock = $conf->{lock};
+    if ($lock && !($nonstrict && ($lock eq 'backup' || $lock eq 'mounted'))) {
+	die "service is locked with lock '$lock'\n" if !$noerr;
+	return undef;
+    }
+
+    # TODO: check more (e.g. storage availability)
+
+    return 1;
+}
+
 1;
diff --git a/src/PVE/HA/Resources/PVEVM.pm b/src/PVE/HA/Resources/PVEVM.pm
index 4c06df9..e3e8c55 100644
--- a/src/PVE/HA/Resources/PVEVM.pm
+++ b/src/PVE/HA/Resources/PVEVM.pm
@@ -116,4 +116,24 @@ sub check_running {
     return PVE::QemuServer::check_running($vmid, 1, $nodename);
 }
 
+sub check_service_is_relocatable {
+    my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+    my $conf = PVE::QemuConfig->load_config($id, $service_node);
+
+    # check for blocking locks, when doing recovery allow safe-to-delete locks
+    my $lock = $conf->{lock};
+    if ($lock && !($nonstrict && $lock eq 'backup')) {
+	die "service is locked with lock '$lock'\n" if !$noerr;
+	return undef;
+    }
+
+    # tell method to die if any local resources are in use
+    return undef if !PVE::QemuServer::check_local_resources($conf, $noerr);
+
+    # TODO: check more (e.g. storage availability)
+
+    return 1;
+}
+
 1;
diff --git a/src/PVE/HA/Sim/Resources.pm b/src/PVE/HA/Sim/Resources.pm
index fe82332..d34e2ea 100644
--- a/src/PVE/HA/Sim/Resources.pm
+++ b/src/PVE/HA/Sim/Resources.pm
@@ -123,5 +123,22 @@ sub migrate {
     return defined($ss->{$sid}) ? 0 : 1;
 }
 
+sub check_service_is_relocatable {
+    my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+    my $sid = $self->type() . ":$id";
+    my $hardware = $haenv->hardware();
+
+    my $conf = $hardware->read_service_config();
+
+    # check for blocking locks, when doing recovery allow safe-to-delete locks
+    my $lock = $conf->{$sid}->{lock};
+    if ($lock && !($nonstrict && ($lock eq 'backup' || $lock eq 'mounted'))) {
+	die "service is locked with lock '$lock'\n" if !$noerr;
+	return undef;
+    }
+
+    return 1;
+}
 
 1;
diff --git a/src/PVE/HA/Tools.pm b/src/PVE/HA/Tools.pm
index 072ef8f..3e72e3e 100644
--- a/src/PVE/HA/Tools.pm
+++ b/src/PVE/HA/Tools.pm
@@ -193,6 +193,22 @@ sub upid_wait {
     PVE::ProcFSTools::upid_wait($upid, $waitfunc, 5);
 }
 
+# checks if a service can be relocated, i.e. if it's not bound to a node
+# setting the 'nonstrict' parameter allows some of the checks to fail, e.g.
+# during recovery certain service types may still have a backup lock left over
+sub check_service_is_relocatable {
+    my ($sid, $haenv, $node, $nonstrict) = @_;
+
+    my (undef, $type, $id) = PVE::HA::Tools::parse_sid($sid);
+    my $plugin = PVE::HA::Resources->lookup($type);
+
+    if (!$plugin) {
+	die "service '$sid' has unknown resource type '$type'";
+    }
+
+    return $plugin->check_service_is_relocatable($haenv, $id, $node, $nonstrict);
+}
+
 # bash auto completion helper
 
 sub complete_sid {
diff --git a/src/test/test-locked-service2/log.expect b/src/test/test-locked-service2/log.expect
new file mode 100644
index 0000000..ab89adc
--- /dev/null
+++ b/src/test/test-locked-service2/log.expect
@@ -0,0 +1,39 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 lock snapshot
+info    220      cmdlist: execute network node3 off
+info    220    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    224    node3/crm: status change slave => wait_for_quorum
+info    225    node3/lrm: status change active => lost_agent_lock
+info    260    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info    260    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    266     watchdog: execute power node3 off
+info    265    node3/crm: killed by poweroff
+info    266    node3/lrm: killed by poweroff
+info    266     hardware: server 'node3' stopped by poweroff (watchdog)
+info    340    node1/crm: got lock 'ha_agent_node3_lock'
+info    340    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info    340    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+err     340    node1/crm: service 'vm:103' not recoverable: service is locked with lock 'snapshot'
+info    340    node1/crm: service 'vm:103': state changed from 'fence' to 'error'
+info    820     hardware: exit simulation - done
-- 
2.1.4




More information about the pve-devel mailing list