[pve-devel] [PATCH ha-manager] TestHardware: correct shutdown/reboot behaviour of CRM and LRM
Thomas Lamprecht
t.lamprecht at proxmox.com
Mon Jan 18 10:26:45 CET 2016
Instead of shutting down the LRM and then killing the CRM, we now
also send a shutdown request to the CRM. This mirrors the real-world
behaviour much better and lets us also test the lock release from
the CRM.
To accomplish this we add new sim_hardware commands for stopping and
starting the CRM.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Sim/TestHardware.pm | 41 ++++++++++++++++++++++++-----
src/test/test-reboot1/log.expect | 13 +++++-----
src/test/test-shutdown1/log.expect | 29 +++++++++++----------
src/test/test-shutdown2/log.expect | 29 +++++++++++----------
src/test/test-shutdown3/log.expect | 29 +++++++++++----------
src/test/test-shutdown4/README | 5 ++++
src/test/test-shutdown4/cmdlist | 4 +++
src/test/test-shutdown4/hardware_status | 5 ++++
src/test/test-shutdown4/log.expect | 46 +++++++++++++++++++++++++++++++++
src/test/test-shutdown4/manager_status | 1 +
src/test/test-shutdown4/service_config | 3 +++
11 files changed, 151 insertions(+), 54 deletions(-)
create mode 100644 src/test/test-shutdown4/README
create mode 100644 src/test/test-shutdown4/cmdlist
create mode 100644 src/test/test-shutdown4/hardware_status
create mode 100644 src/test/test-shutdown4/log.expect
create mode 100644 src/test/test-shutdown4/manager_status
create mode 100644 src/test/test-shutdown4/service_config
diff --git a/src/PVE/HA/Sim/TestHardware.pm b/src/PVE/HA/Sim/TestHardware.pm
index d7f4efb..cfd48e7 100644
--- a/src/PVE/HA/Sim/TestHardware.pm
+++ b/src/PVE/HA/Sim/TestHardware.pm
@@ -160,6 +160,19 @@ sub sim_hardware_cmd {
$d->{lrm_restart} = 1;
$d->{lrm}->shutdown_request();
}
+ } elsif ($cmd eq 'crm') {
+
+ if ($action eq 'stop') {
+ if ($d->{crm}) {
+ $d->{crm_stop} = 1;
+ $d->{crm}->shutdown_request();
+ }
+ } elsif ($action eq 'start') {
+ $d->{crm} = PVE::HA::CRM->new($d->{crm_env}) if !$d->{crm};
+ } else {
+ die "sim_hardware_cmd: unknown action '$action'";
+ }
+
} elsif ($cmd eq 'service') {
if ($action eq 'enabled' || $action eq 'disabled') {
@@ -221,12 +234,30 @@ sub run {
$d->{crm_env}->loop_start_hook($self->get_time());
- die "implement me (CRM exit)" if !$crm->do_one_iteration();
+ my $exit_crm = !$crm->do_one_iteration();
$d->{crm_env}->loop_end_hook();
my $nodetime = $d->{crm_env}->get_time();
$self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};
+
+ if ($exit_crm) {
+ $d->{crm_env}->log('info', "exit (loop end)");
+ $d->{crm} = undef;
+
+ my $cstatus = $self->read_hardware_status_nolock();
+ my $nstatus = $cstatus->{$node} || die "no node status for node '$node'";
+ my $shutdown = $nstatus->{shutdown} || '';
+ if ($shutdown eq 'reboot') {
+ $self->sim_hardware_cmd("power $node off", 'reboot');
+ $self->sim_hardware_cmd("power $node on", 'reboot');
+ } elsif ($shutdown eq 'shutdown') {
+ $self->sim_hardware_cmd("power $node off", 'shutdown');
+ } elsif (!$d->{crm_stop}) {
+ die "unexpected CRM exit - not implemented"
+ }
+ $d->{crm_stop} = undef;
+ }
}
if (my $lrm = $d->{lrm}) {
@@ -250,11 +281,9 @@ sub run {
die "lrm restart during shutdown - not implemented" if $shutdown;
$d->{lrm_restart} = undef;
$d->{lrm} = PVE::HA::LRM->new($d->{lrm_env});
- } elsif ($shutdown eq 'reboot') {
- $self->sim_hardware_cmd("power $node off", 'reboot');
- $self->sim_hardware_cmd("power $node on", 'reboot');
- } elsif ($shutdown eq 'shutdown') {
- $self->sim_hardware_cmd("power $node off", 'shutdown');
+ } elsif ($shutdown eq 'reboot' || $shutdown eq 'shutdown') {
+ # exit the LRM before the CRM to reflect real world behaviour
+ $self->sim_hardware_cmd("crm $node stop", $shutdown);
} else {
die "unexpected LRM exit - not implemented"
}
diff --git a/src/test/test-reboot1/log.expect b/src/test/test-reboot1/log.expect
index 12c3fe5..840f56d 100644
--- a/src/test/test-reboot1/log.expect
+++ b/src/test/test-reboot1/log.expect
@@ -25,14 +25,15 @@ info 120 node3/lrm: shutdown LRM, stop all services
info 125 node3/lrm: stopping service vm:103
info 125 node3/lrm: service status vm:103 stopped
info 126 node3/lrm: exit (loop end)
-info 126 reboot: execute power node3 off
-info 125 node3/crm: killed by poweroff
-info 126 reboot: execute power node3 on
-info 125 node3/crm: status change startup => wait_for_quorum
-info 126 node3/lrm: status change startup => wait_for_agent_lock
-info 144 node3/crm: status change wait_for_quorum => slave
+info 126 reboot: execute crm node3 stop
+info 145 node3/crm: exit (loop end)
+info 145 reboot: execute power node3 off
+info 145 reboot: execute power node3 on
+info 145 node3/crm: status change startup => wait_for_quorum
+info 140 node3/lrm: status change startup => wait_for_agent_lock
info 145 node3/lrm: got lock 'ha_agent_node3_lock'
info 145 node3/lrm: status change wait_for_agent_lock => active
info 145 node3/lrm: starting service vm:103
info 145 node3/lrm: service status vm:103 started
+info 164 node3/crm: status change wait_for_quorum => slave
info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown1/log.expect b/src/test/test-shutdown1/log.expect
index 5c063ab..76f5133 100644
--- a/src/test/test-shutdown1/log.expect
+++ b/src/test/test-shutdown1/log.expect
@@ -25,18 +25,19 @@ info 120 node3/lrm: shutdown LRM, stop all services
info 125 node3/lrm: stopping service vm:103
info 125 node3/lrm: service status vm:103 stopped
info 126 node3/lrm: exit (loop end)
-info 126 shutdown: execute power node3 off
-info 125 node3/crm: killed by poweroff
-info 140 node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info 180 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
-info 180 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info 180 node1/crm: got lock 'ha_agent_node3_lock'
-info 180 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info 180 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 180 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
-info 180 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
-info 181 node1/lrm: got lock 'ha_agent_node1_lock'
-info 181 node1/lrm: status change wait_for_agent_lock => active
-info 181 node1/lrm: starting service vm:103
-info 181 node1/lrm: service status vm:103 started
+info 126 shutdown: execute crm node3 stop
+info 145 node3/crm: exit (loop end)
+info 145 shutdown: execute power node3 off
+info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 200 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 200 node1/crm: got lock 'ha_agent_node3_lock'
+info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
+info 200 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
+info 201 node1/lrm: got lock 'ha_agent_node1_lock'
+info 201 node1/lrm: status change wait_for_agent_lock => active
+info 201 node1/lrm: starting service vm:103
+info 201 node1/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown2/log.expect b/src/test/test-shutdown2/log.expect
index b367b64..4b90294 100644
--- a/src/test/test-shutdown2/log.expect
+++ b/src/test/test-shutdown2/log.expect
@@ -25,20 +25,21 @@ info 120 node3/lrm: shutdown LRM, stop all services
info 125 node3/lrm: stopping service vm:103
info 125 node3/lrm: service status vm:103 stopped
info 126 node3/lrm: exit (loop end)
-info 126 shutdown: execute power node3 off
-info 125 node3/crm: killed by poweroff
-info 140 node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info 180 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
-info 180 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info 180 node1/crm: got lock 'ha_agent_node3_lock'
-info 180 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info 180 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 180 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
-info 180 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
-info 181 node1/lrm: got lock 'ha_agent_node1_lock'
-info 181 node1/lrm: status change wait_for_agent_lock => active
-info 181 node1/lrm: starting service vm:103
-info 181 node1/lrm: service status vm:103 started
+info 126 shutdown: execute crm node3 stop
+info 145 node3/crm: exit (loop end)
+info 145 shutdown: execute power node3 off
+info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 200 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 200 node1/crm: got lock 'ha_agent_node3_lock'
+info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
+info 200 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
+info 201 node1/lrm: got lock 'ha_agent_node1_lock'
+info 201 node1/lrm: status change wait_for_agent_lock => active
+info 201 node1/lrm: starting service vm:103
+info 201 node1/lrm: service status vm:103 started
info 500 cmdlist: execute power node3 on
info 500 node3/crm: status change startup => wait_for_quorum
info 500 node3/lrm: status change startup => wait_for_agent_lock
diff --git a/src/test/test-shutdown3/log.expect b/src/test/test-shutdown3/log.expect
index 559cb4f..8ceb042 100644
--- a/src/test/test-shutdown3/log.expect
+++ b/src/test/test-shutdown3/log.expect
@@ -25,20 +25,21 @@ info 120 node3/lrm: shutdown LRM, stop all services
info 125 node3/lrm: stopping service ct:103
info 125 node3/lrm: service status ct:103 stopped
info 126 node3/lrm: exit (loop end)
-info 126 shutdown: execute power node3 off
-info 125 node3/crm: killed by poweroff
-info 140 node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info 180 node1/crm: service 'ct:103': state changed from 'started' to 'fence'
-info 180 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info 180 node1/crm: got lock 'ha_agent_node3_lock'
-info 180 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info 180 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 180 node1/crm: service 'ct:103': state changed from 'fence' to 'stopped'
-info 180 node1/crm: service 'ct:103': state changed from 'stopped' to 'started' (node = node1)
-info 181 node1/lrm: got lock 'ha_agent_node1_lock'
-info 181 node1/lrm: status change wait_for_agent_lock => active
-info 181 node1/lrm: starting service ct:103
-info 181 node1/lrm: service status ct:103 started
+info 126 shutdown: execute crm node3 stop
+info 145 node3/crm: exit (loop end)
+info 145 shutdown: execute power node3 off
+info 160 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 200 node1/crm: service 'ct:103': state changed from 'started' to 'fence'
+info 200 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 200 node1/crm: got lock 'ha_agent_node3_lock'
+info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'stopped'
+info 200 node1/crm: service 'ct:103': state changed from 'stopped' to 'started' (node = node1)
+info 201 node1/lrm: got lock 'ha_agent_node1_lock'
+info 201 node1/lrm: status change wait_for_agent_lock => active
+info 201 node1/lrm: starting service ct:103
+info 201 node1/lrm: service status ct:103 started
info 500 cmdlist: execute power node3 on
info 500 node3/crm: status change startup => wait_for_quorum
info 500 node3/lrm: status change startup => wait_for_agent_lock
diff --git a/src/test/test-shutdown4/README b/src/test/test-shutdown4/README
new file mode 100644
index 0000000..0c5fe02
--- /dev/null
+++ b/src/test/test-shutdown4/README
@@ -0,0 +1,5 @@
+This tests if the manager lock gets released AND the services from the node with
+the manager lock get cleanly shut down without changing the state of the service
+in the cluster.
+That means that the powered off node gets fenced by the new master and the
+service will be relocated and started again.
diff --git a/src/test/test-shutdown4/cmdlist b/src/test/test-shutdown4/cmdlist
new file mode 100644
index 0000000..e84297f
--- /dev/null
+++ b/src/test/test-shutdown4/cmdlist
@@ -0,0 +1,4 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "shutdown node1" ]
+]
diff --git a/src/test/test-shutdown4/hardware_status b/src/test/test-shutdown4/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-shutdown4/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-shutdown4/log.expect b/src/test/test-shutdown4/log.expect
new file mode 100644
index 0000000..c5564cc
--- /dev/null
+++ b/src/test/test-shutdown4/log.expect
@@ -0,0 +1,46 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:100' on node 'node1'
+info 21 node1/lrm: got lock 'ha_agent_node1_lock'
+info 21 node1/lrm: status change wait_for_agent_lock => active
+info 21 node1/lrm: starting service vm:100
+info 21 node1/lrm: service status vm:100 started
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 120 cmdlist: execute shutdown node1
+info 120 node1/lrm: shutdown LRM, stop all services
+info 121 node1/lrm: stopping service vm:100
+info 121 node1/lrm: service status vm:100 stopped
+info 122 node1/lrm: exit (loop end)
+info 122 shutdown: execute crm node1 stop
+info 140 node1/crm: voluntary release CRM lock
+info 141 node1/crm: exit (loop end)
+info 141 shutdown: execute power node1 off
+info 141 node2/crm: got lock 'ha_manager_lock'
+info 141 node2/crm: status change slave => master
+info 141 node2/crm: node 'node1': state changed from 'online' => 'unknown'
+info 220 node2/crm: service 'vm:100': state changed from 'started' to 'fence'
+info 220 node2/crm: node 'node1': state changed from 'unknown' => 'fence'
+info 220 node2/crm: got lock 'ha_agent_node1_lock'
+info 220 node2/crm: fencing: acknowleged - got agent lock for node 'node1'
+info 220 node2/crm: node 'node1': state changed from 'fence' => 'unknown'
+info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'stopped'
+info 220 node2/crm: service 'vm:100': state changed from 'stopped' to 'started' (node = node2)
+info 221 node2/lrm: got lock 'ha_agent_node2_lock'
+info 221 node2/lrm: status change wait_for_agent_lock => active
+info 221 node2/lrm: starting service vm:100
+info 221 node2/lrm: service status vm:100 started
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown4/manager_status b/src/test/test-shutdown4/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-shutdown4/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-shutdown4/service_config b/src/test/test-shutdown4/service_config
new file mode 100644
index 0000000..01d6242
--- /dev/null
+++ b/src/test/test-shutdown4/service_config
@@ -0,0 +1,3 @@
+{
+ "vm:100": { "node": "node1", "state": "enabled" }
+}
--
2.1.4
More information about the pve-devel
mailing list