[pve-devel] [PATCH ha-manager v3 5/6] add check if a service is relocatable and assert it on recovery
Thomas Lamprecht
t.lamprecht at proxmox.com
Wed Sep 14 11:29:44 CEST 2016
Add a basic check of whether a service is relocatable, i.e. whether it
is not bound to local resources of a node.
Assert this check when a service gets recovered after a node failure,
to avoid moving a service only for it to fail to start on the target,
when we already know beforehand that it cannot start there.
Place such services in the error state as they need manual
intervention.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Manager.pm | 7 ++++++
src/PVE/HA/Resources.pm | 7 ++++++
src/PVE/HA/Resources/PVECT.pm | 17 ++++++++++++++
src/PVE/HA/Resources/PVEVM.pm | 20 ++++++++++++++++
src/PVE/HA/Sim/Resources.pm | 17 ++++++++++++++
src/PVE/HA/Tools.pm | 16 +++++++++++++
src/test/test-locked-service2/log.expect | 39 ++++++++++++++++++++++++++++++++
7 files changed, 123 insertions(+)
create mode 100644 src/test/test-locked-service2/log.expect
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index c60df7c..2d1013a 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -261,6 +261,13 @@ my $recover_fenced_service = sub {
$cd, $sd->{node});
if ($recovery_node) {
+ eval { PVE::HA::Tools::check_service_is_relocatable($sid, $haenv, $fenced_node, 1); };
+ if (my $err = $@) {
+ $haenv->log('err', "service '$sid' not recoverable: $err");
+ &$change_service_state($self, $sid, 'error');
+ return;
+ }
+
$haenv->log('info', "recover service '$sid' from fenced node " .
"'$fenced_node' to node '$recovery_node'");
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 3836fc8..fea65c7 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -124,6 +124,13 @@ sub check_running {
die "implement in subclass";
}
+sub check_service_is_relocatable {
+ my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+ die "implement in subclass";
+}
+
+
# package PVE::HA::Resources::IPAddr;
diff --git a/src/PVE/HA/Resources/PVECT.pm b/src/PVE/HA/Resources/PVECT.pm
index b6ebe2f..fa0237b 100644
--- a/src/PVE/HA/Resources/PVECT.pm
+++ b/src/PVE/HA/Resources/PVECT.pm
@@ -114,4 +114,21 @@ sub check_running {
return PVE::LXC::check_running($vmid);
}
+sub check_service_is_relocatable {
+ my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+ my $conf = PVE::LXC::Config->load_config($id, $service_node);
+
+ # check for blocking locks, when doing recovery allow safe-to-delete locks
+ my $lock = $conf->{lock};
+ if ($lock && !($nonstrict && ($lock eq 'backup' || $lock eq 'mounted'))) {
+ die "service is locked with lock '$lock'\n" if !$noerr;
+ return undef;
+ }
+
+ # TODO: check more (e.g. storage availability)
+
+ return 1;
+}
+
1;
diff --git a/src/PVE/HA/Resources/PVEVM.pm b/src/PVE/HA/Resources/PVEVM.pm
index 4c06df9..e3e8c55 100644
--- a/src/PVE/HA/Resources/PVEVM.pm
+++ b/src/PVE/HA/Resources/PVEVM.pm
@@ -116,4 +116,24 @@ sub check_running {
return PVE::QemuServer::check_running($vmid, 1, $nodename);
}
+sub check_service_is_relocatable {
+ my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+ my $conf = PVE::QemuConfig->load_config($id, $service_node);
+
+ # check for blocking locks, when doing recovery allow safe-to-delete locks
+ my $lock = $conf->{lock};
+ if ($lock && !($nonstrict && $lock eq 'backup')) {
+ die "service is locked with lock '$lock'\n" if !$noerr;
+ return undef;
+ }
+
+ # tell method to die if any local resources are in use
+ return undef if !PVE::QemuServer::check_local_resources($conf, $noerr);
+
+ # TODO: check more (e.g. storage availability)
+
+ return 1;
+}
+
1;
diff --git a/src/PVE/HA/Sim/Resources.pm b/src/PVE/HA/Sim/Resources.pm
index fe82332..d34e2ea 100644
--- a/src/PVE/HA/Sim/Resources.pm
+++ b/src/PVE/HA/Sim/Resources.pm
@@ -123,5 +123,22 @@ sub migrate {
return defined($ss->{$sid}) ? 0 : 1;
}
+sub check_service_is_relocatable {
+ my ($self, $haenv, $id, $service_node, $nonstrict, $noerr) = @_;
+
+ my $sid = $self->type() . ":$id";
+ my $hardware = $haenv->hardware();
+
+ my $conf = $hardware->read_service_config();
+
+ # check for blocking locks, when doing recovery allow safe-to-delete locks
+ my $lock = $conf->{$sid}->{lock};
+ if ($lock && !($nonstrict && ($lock eq 'backup' || $lock eq 'mounted'))) {
+ die "service is locked with lock '$lock'\n" if !$noerr;
+ return undef;
+ }
+
+ return 1;
+}
1;
diff --git a/src/PVE/HA/Tools.pm b/src/PVE/HA/Tools.pm
index 072ef8f..3e72e3e 100644
--- a/src/PVE/HA/Tools.pm
+++ b/src/PVE/HA/Tools.pm
@@ -193,6 +193,22 @@ sub upid_wait {
PVE::ProcFSTools::upid_wait($upid, $waitfunc, 5);
}
+# checks if a service can be relocated, i.e. if it is not bound to a node
+# setting the 'nonstrict' parameter allows some of the checks to fail, e.g.
+# during recovery certain service types may still have a backup lock left over
+sub check_service_is_relocatable {
+ my ($sid, $haenv, $node, $nonstrict) = @_;
+
+ my (undef, $type, $id) = PVE::HA::Tools::parse_sid($sid);
+ my $plugin = PVE::HA::Resources->lookup($type);
+
+ if (!$plugin) {
+ die "service '$sid' has unknown resource type '$type'";
+ }
+
+ return $plugin->check_service_is_relocatable($haenv, $id, $node, $nonstrict);
+}
+
# bash auto completion helper
sub complete_sid {
diff --git a/src/test/test-locked-service2/log.expect b/src/test/test-locked-service2/log.expect
new file mode 100644
index 0000000..ab89adc
--- /dev/null
+++ b/src/test/test-locked-service2/log.expect
@@ -0,0 +1,39 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 lock snapshot
+info 220 cmdlist: execute network node3 off
+info 220 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 224 node3/crm: status change slave => wait_for_quorum
+info 225 node3/lrm: status change active => lost_agent_lock
+info 260 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 260 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 266 watchdog: execute power node3 off
+info 265 node3/crm: killed by poweroff
+info 266 node3/lrm: killed by poweroff
+info 266 hardware: server 'node3' stopped by poweroff (watchdog)
+info 340 node1/crm: got lock 'ha_agent_node3_lock'
+info 340 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+err 340 node1/crm: service 'vm:103' not recoverable: service is locked with lock 'snapshot'
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'error'
+info 820 hardware: exit simulation - done
--
2.1.4
More information about the pve-devel
mailing list