[pve-devel] [PATCH ha-manager 2/3] fix #1378: allow to specify a service shutdown policy

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Dec 19 11:39:40 CET 2018


Allow an admin to set a datacenter wide HA policy which can change
the way we handle services on a node shutdown.

There's:

* freeze: always freeze servivces, independent of the shutdown type
  (reboot, poweroff)
* failover: never freeze services, this means that a service will get
  recovered to another node if possible and if the current node does
  not comes back up in the grace period of 1 minute.
* default: this is the current behavior, freeze on reboot but do not
  freeze on poweroff

Add to tests, shutdown-policy1 which is based of the reboot1 test,
but enforces no freeze with a failover policy, and shutdown-policy2
which is based on the shutdown1 test but with a explicit freeze
policy. You can compare (diff) each tests log result to the test it's
based on to see what changes.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/LRM.pm                             | 22 +++++++++-
 src/test/test-shutdown-policy1/cmdlist        |  4 ++
 src/test/test-shutdown-policy1/datacenter.cfg |  5 +++
 .../test-shutdown-policy1/hardware_status     |  5 +++
 src/test/test-shutdown-policy1/log.expect     | 40 +++++++++++++++++++
 src/test/test-shutdown-policy1/manager_status |  1 +
 src/test/test-shutdown-policy1/service_config |  3 ++
 src/test/test-shutdown-policy2/cmdlist        |  4 ++
 src/test/test-shutdown-policy2/datacenter.cfg |  5 +++
 .../test-shutdown-policy2/hardware_status     |  5 +++
 src/test/test-shutdown-policy2/log.expect     | 34 ++++++++++++++++
 src/test/test-shutdown-policy2/manager_status |  1 +
 src/test/test-shutdown-policy2/service_config |  3 ++
 13 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 src/test/test-shutdown-policy1/cmdlist
 create mode 100644 src/test/test-shutdown-policy1/datacenter.cfg
 create mode 100644 src/test/test-shutdown-policy1/hardware_status
 create mode 100644 src/test/test-shutdown-policy1/log.expect
 create mode 100644 src/test/test-shutdown-policy1/manager_status
 create mode 100644 src/test/test-shutdown-policy1/service_config
 create mode 100644 src/test/test-shutdown-policy2/cmdlist
 create mode 100644 src/test/test-shutdown-policy2/datacenter.cfg
 create mode 100644 src/test/test-shutdown-policy2/hardware_status
 create mode 100644 src/test/test-shutdown-policy2/log.expect
 create mode 100644 src/test/test-shutdown-policy2/manager_status
 create mode 100644 src/test/test-shutdown-policy2/service_config

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index dda82eb..fc560fa 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -53,6 +53,20 @@ sub shutdown_request {
 
     my ($shutdown, $reboot) = $haenv->is_node_shutdown();
 
+    my $dc_ha_cfg = $haenv->get_ha_settings();
+    my $shutdown_policy = $dc_ha_cfg->{shutdown_policy} // 'default';
+
+    my $freeze_all = $reboot;
+    if ($shutdown_policy eq 'default') {
+	$freeze_all = $reboot;
+    } elsif ($shutdown_policy eq 'freeze') {
+	$freeze_all = 1;
+    } elsif ($shutdown_policy eq 'failover') {
+	$freeze_all = 0;
+    } else {
+	$haenv->log('err', "unkown shutdown policy '$shutdown_policy', fall back to default");
+    }
+
     if ($shutdown) {
 	# *always* queue stop jobs for all services if the node shuts down,
 	# independent if it's a reboot or a poweroff, else we may corrupt
@@ -69,8 +83,12 @@ sub shutdown_request {
     }
 
     if ($shutdown) {
-	if ($reboot) {
-	    $haenv->log('info', "reboot LRM, stop and freeze all services");
+	if ($freeze_all) {
+	    if ($shutdown_policy eq 'default') {
+		$haenv->log('info', "reboot LRM, stop and freeze all services");
+	    } else {
+		$haenv->log('info', "shutdown LRM, stop and freeze all services");
+	    }
 	    $self->{mode} = 'restart';
 	} else {
 	    $haenv->log('info', "shutdown LRM, stop all services");
diff --git a/src/test/test-shutdown-policy1/cmdlist b/src/test/test-shutdown-policy1/cmdlist
new file mode 100644
index 0000000..8558351
--- /dev/null
+++ b/src/test/test-shutdown-policy1/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "reboot node3" ]
+]
diff --git a/src/test/test-shutdown-policy1/datacenter.cfg b/src/test/test-shutdown-policy1/datacenter.cfg
new file mode 100644
index 0000000..6108ece
--- /dev/null
+++ b/src/test/test-shutdown-policy1/datacenter.cfg
@@ -0,0 +1,5 @@
+{
+    "ha": {
+        "shutdown_policy": "failover"
+    }
+}
diff --git a/src/test/test-shutdown-policy1/hardware_status b/src/test/test-shutdown-policy1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-shutdown-policy1/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-shutdown-policy1/log.expect b/src/test/test-shutdown-policy1/log.expect
new file mode 100644
index 0000000..385b07a
--- /dev/null
+++ b/src/test/test-shutdown-policy1/log.expect
@@ -0,0 +1,40 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute reboot node3
+info    120    node3/lrm: shutdown LRM, stop all services
+info    125    node3/lrm: stopping service vm:103
+info    125    node3/lrm: service status vm:103 stopped
+info    126    node3/lrm: exit (loop end)
+info    126       reboot: execute crm node3 stop
+info    125    node3/crm: server received shutdown request
+info    145    node3/crm: exit (loop end)
+info    145       reboot: execute power node3 off
+info    145       reboot: execute power node3 on
+info    145    node3/crm: status change startup => wait_for_quorum
+info    140    node3/lrm: status change startup => wait_for_agent_lock
+info    145    node3/lrm: got lock 'ha_agent_node3_lock'
+info    145    node3/lrm: status change wait_for_agent_lock => active
+info    145    node3/lrm: starting service vm:103
+info    145    node3/lrm: service status vm:103 started
+info    164    node3/crm: status change wait_for_quorum => slave
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-shutdown-policy1/manager_status b/src/test/test-shutdown-policy1/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-shutdown-policy1/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-shutdown-policy1/service_config b/src/test/test-shutdown-policy1/service_config
new file mode 100644
index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-shutdown-policy1/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
diff --git a/src/test/test-shutdown-policy2/cmdlist b/src/test/test-shutdown-policy2/cmdlist
new file mode 100644
index 0000000..a86b9e2
--- /dev/null
+++ b/src/test/test-shutdown-policy2/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "shutdown node3" ]
+]
diff --git a/src/test/test-shutdown-policy2/datacenter.cfg b/src/test/test-shutdown-policy2/datacenter.cfg
new file mode 100644
index 0000000..0d411c1
--- /dev/null
+++ b/src/test/test-shutdown-policy2/datacenter.cfg
@@ -0,0 +1,5 @@
+{
+    "ha": {
+        "shutdown_policy": "freeze"
+    }
+}
diff --git a/src/test/test-shutdown-policy2/hardware_status b/src/test/test-shutdown-policy2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-shutdown-policy2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-shutdown-policy2/log.expect b/src/test/test-shutdown-policy2/log.expect
new file mode 100644
index 0000000..a36c628
--- /dev/null
+++ b/src/test/test-shutdown-policy2/log.expect
@@ -0,0 +1,34 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute shutdown node3
+info    120    node3/lrm: shutdown LRM, stop and freeze all services
+info    120    node1/crm: service 'vm:103': state changed from 'started' to 'freeze'
+info    125    node3/lrm: stopping service vm:103
+info    125    node3/lrm: service status vm:103 stopped
+info    126    node3/lrm: exit (loop end)
+info    126     shutdown: execute crm node3 stop
+info    125    node3/crm: server received shutdown request
+info    145    node3/crm: exit (loop end)
+info    145     shutdown: execute power node3 off
+info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-shutdown-policy2/manager_status b/src/test/test-shutdown-policy2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-shutdown-policy2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-shutdown-policy2/service_config b/src/test/test-shutdown-policy2/service_config
new file mode 100644
index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-shutdown-policy2/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
-- 
2.19.2





More information about the pve-devel mailing list