[pve-devel] [PATCH ha-manager 3/3] add VirtFail resource and use it in new regression tests
Thomas Lamprecht
t.lamprecht at proxmox.com
Wed Feb 10 14:13:44 CET 2016
This resource lets us test a defined failure behaviour of services.
Through the VMID we define how it should behave, with the following
rules:
When the service has the SID "fa:abcde" the digits a - e mean:
a - no meaning but can be used for differentiating similar resources
b - how many start attempts fail before a start succeeds (0=default, never fails)
c - how many migrate attempts fail before a migration succeeds (0=default, never fails)
d - should shutdown be successful (0 = yes, anything else no)
e - return value of $plugin->exists() defaults to 1 if not set
a,b,c should always be set even if b and c have defaults (makes test
purpose clearer)
d and e are optional.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
Check out the READMEs of the tests for a short description of what
they cover.
src/PVE/HA/Sim/Env.pm | 2 +
src/PVE/HA/Sim/Hardware.pm | 4 +-
src/PVE/HA/Sim/Resources/VirtFail.pm | 93 +++++++++++++++++++++++++
src/test/test-resource-failure1/README | 2 +
src/test/test-resource-failure1/cmdlist | 4 ++
src/test/test-resource-failure1/hardware_status | 5 ++
src/test/test-resource-failure1/log.expect | 30 ++++++++
src/test/test-resource-failure1/manager_status | 1 +
src/test/test-resource-failure1/service_config | 3 +
src/test/test-resource-failure2/README | 4 ++
src/test/test-resource-failure2/cmdlist | 4 ++
src/test/test-resource-failure2/groups | 2 +
src/test/test-resource-failure2/hardware_status | 5 ++
src/test/test-resource-failure2/log.expect | 44 ++++++++++++
src/test/test-resource-failure2/manager_status | 1 +
src/test/test-resource-failure2/service_config | 3 +
src/test/test-resource-failure3/README | 3 +
src/test/test-resource-failure3/cmdlist | 4 ++
src/test/test-resource-failure3/hardware_status | 5 ++
src/test/test-resource-failure3/log.expect | 30 ++++++++
src/test/test-resource-failure3/manager_status | 1 +
src/test/test-resource-failure3/service_config | 3 +
src/test/test-resource-failure4/README | 3 +
src/test/test-resource-failure4/cmdlist | 5 ++
src/test/test-resource-failure4/hardware_status | 5 ++
src/test/test-resource-failure4/log.expect | 42 +++++++++++
src/test/test-resource-failure4/manager_status | 1 +
src/test/test-resource-failure4/service_config | 3 +
src/test/test-resource-failure5/README | 4 ++
src/test/test-resource-failure5/cmdlist | 5 ++
src/test/test-resource-failure5/hardware_status | 5 ++
src/test/test-resource-failure5/log.expect | 38 ++++++++++
src/test/test-resource-failure5/manager_status | 1 +
src/test/test-resource-failure5/service_config | 3 +
34 files changed, 367 insertions(+), 1 deletion(-)
create mode 100644 src/PVE/HA/Sim/Resources/VirtFail.pm
create mode 100644 src/test/test-resource-failure1/README
create mode 100644 src/test/test-resource-failure1/cmdlist
create mode 100644 src/test/test-resource-failure1/hardware_status
create mode 100644 src/test/test-resource-failure1/log.expect
create mode 100644 src/test/test-resource-failure1/manager_status
create mode 100644 src/test/test-resource-failure1/service_config
create mode 100644 src/test/test-resource-failure2/README
create mode 100644 src/test/test-resource-failure2/cmdlist
create mode 100644 src/test/test-resource-failure2/groups
create mode 100644 src/test/test-resource-failure2/hardware_status
create mode 100644 src/test/test-resource-failure2/log.expect
create mode 100644 src/test/test-resource-failure2/manager_status
create mode 100644 src/test/test-resource-failure2/service_config
create mode 100644 src/test/test-resource-failure3/README
create mode 100644 src/test/test-resource-failure3/cmdlist
create mode 100644 src/test/test-resource-failure3/hardware_status
create mode 100644 src/test/test-resource-failure3/log.expect
create mode 100644 src/test/test-resource-failure3/manager_status
create mode 100644 src/test/test-resource-failure3/service_config
create mode 100644 src/test/test-resource-failure4/README
create mode 100644 src/test/test-resource-failure4/cmdlist
create mode 100644 src/test/test-resource-failure4/hardware_status
create mode 100644 src/test/test-resource-failure4/log.expect
create mode 100644 src/test/test-resource-failure4/manager_status
create mode 100644 src/test/test-resource-failure4/service_config
create mode 100644 src/test/test-resource-failure5/README
create mode 100644 src/test/test-resource-failure5/cmdlist
create mode 100644 src/test/test-resource-failure5/hardware_status
create mode 100644 src/test/test-resource-failure5/log.expect
create mode 100644 src/test/test-resource-failure5/manager_status
create mode 100644 src/test/test-resource-failure5/service_config
diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index e154988..1978ebc 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -13,9 +13,11 @@ use PVE::HA::Env;
use PVE::HA::Resources;
use PVE::HA::Sim::Resources::VirtVM;
use PVE::HA::Sim::Resources::VirtCT;
+use PVE::HA::Sim::Resources::VirtFail;
PVE::HA::Sim::Resources::VirtVM->register();
PVE::HA::Sim::Resources::VirtCT->register();
+PVE::HA::Sim::Resources::VirtFail->register();
PVE::HA::Resources->init();
diff --git a/src/PVE/HA/Sim/Hardware.pm b/src/PVE/HA/Sim/Hardware.pm
index 2cbe64d..652f11d 100644
--- a/src/PVE/HA/Sim/Hardware.pm
+++ b/src/PVE/HA/Sim/Hardware.pm
@@ -93,13 +93,15 @@ sub read_service_config {
die "service '$sid' without assigned node!" if !$d->{node};
- if ($sid =~ m/^(vm|ct):(\d+)$/) {
+ if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
$d->{type} = $1;
$d->{name} = $2;
} else {
die "implement me";
}
$d->{state} = 'disabled' if !$d->{state};
+ $d->{max_restart} = 1 if !defined($d->{max_restart});
+ $d->{max_relocate} = 1 if !defined($d->{max_relocate});
}
return $conf;
diff --git a/src/PVE/HA/Sim/Resources/VirtFail.pm b/src/PVE/HA/Sim/Resources/VirtFail.pm
new file mode 100644
index 0000000..e421f29
--- /dev/null
+++ b/src/PVE/HA/Sim/Resources/VirtFail.pm
@@ -0,0 +1,93 @@
+package PVE::HA::Sim::Resources::VirtFail;
+
+use strict;
+use warnings;
+
+use base qw(PVE::HA::Sim::Resources);
+
+# This class lets us simulate failing resources for the regression tests.
+# To make it more interesting we can encode some behaviour in the VMID
+# with the following format, where fa: is the type and a, b, c, ...
+# are digits in base 10
+# fa:abcde
+# meaning:
+# a - no meaning but can be used for differentiating similar resources
+# b - how many start attempts fail before a start succeeds (0 is normal behaviour) (should be set)
+# c - how many migrate attempts fail before a migration succeeds (0 is normal behaviour) (should be set)
+# d - should shutdown be successful (0 = yes, anything else no) (optional)
+# e - return value of $plugin->exists() defaults to 1 if not set (optional)
+
+my $decode_id = sub {
+ my $id = shift;
+
+ my ($start, $migrate, $stop, $exists) = $id =~ /^\d(\d)(\d)(\d)?(\d)?/g;
+
+ $start = 0 if !defined($start);
+ $migrate = 0 if !defined($migrate);
+ $stop = 0 if !defined($stop);
+ $exists = 1 if !defined($exists);
+
+ return ($start, $migrate, $stop, $exists)
+};
+
+my $tries = {
+ start => {},
+ migrate => {},
+};
+
+
+sub type {
+ return 'fa';
+}
+
+sub exists {
+ my ($class, $id, $noerr) = @_;
+
+ my (undef, undef, undef, $exists) = &$decode_id($id);
+ print $exists ."\n";
+
+ return $exists;
+}
+
+sub start {
+ my ($class, $haenv, $id) = @_;
+
+ my ($start_failure_count) = &$decode_id($id);
+
+ $tries->{start}->{$id} = 0 if !$tries->{start}->{$id};
+ $tries->{start}->{$id}++;
+
+ return if $start_failure_count >= $tries->{start}->{$id};
+
+ $tries->{start}->{$id} = 0; # reset counts
+
+ return $class->SUPER::start($haenv, $id);
+
+}
+
+sub shutdown {
+ my ($class, $haenv, $id) = @_;
+
+ my (undef, undef, $cannot_stop) = &$decode_id($id);
+
+ return if $cannot_stop;
+
+ return $class->SUPER::shutdown($haenv, $id);
+}
+
+sub migrate {
+ my ($class, $haenv, $id, $target, $online) = @_;
+
+ my (undef, $migrate_failure_count) = &$decode_id($id);
+
+ $tries->{migrate}->{$id} = 0 if !$tries->{migrate}->{$id};
+ $tries->{migrate}->{$id}++;
+
+ return if $migrate_failure_count >= $tries->{migrate}->{$id};
+
+ $tries->{migrate}->{$id} = 0; # reset counts
+
+ return $class->SUPER::migrate($haenv, $id, $target, $online);
+
+}
+1;
diff --git a/src/test/test-resource-failure1/README b/src/test/test-resource-failure1/README
new file mode 100644
index 0000000..e1a6dc9
--- /dev/null
+++ b/src/test/test-resource-failure1/README
@@ -0,0 +1,2 @@
+Test restart policy if service fails one time.
+The LRM should try again to start it and succeed.
diff --git a/src/test/test-resource-failure1/cmdlist b/src/test/test-resource-failure1/cmdlist
new file mode 100644
index 0000000..5489743
--- /dev/null
+++ b/src/test/test-resource-failure1/cmdlist
@@ -0,0 +1,4 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service fa:110 enabled" ]
+]
diff --git a/src/test/test-resource-failure1/hardware_status b/src/test/test-resource-failure1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure1/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure1/log.expect b/src/test/test-resource-failure1/log.expect
new file mode 100644
index 0000000..97d9fdb
--- /dev/null
+++ b/src/test/test-resource-failure1/log.expect
@@ -0,0 +1,30 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'fa:110' on node 'node2'
+info 20 node1/crm: service 'fa:110': state changed from 'started' to 'request_stop'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 40 node1/crm: service 'fa:110': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute service fa:110 enabled
+info 120 node1/crm: service 'fa:110': state changed from 'stopped' to 'started' (node = node2)
+info 123 node2/lrm: starting service fa:110
+warn 123 node2/lrm: unable to start service fa:110
+warn 123 node2/lrm: restart policy: retry number 1 for service 'fa:110'
+info 143 node2/lrm: starting service fa:110
+info 143 node2/lrm: service status fa:110 started
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-resource-failure1/manager_status b/src/test/test-resource-failure1/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure1/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure1/service_config b/src/test/test-resource-failure1/service_config
new file mode 100644
index 0000000..cf17020
--- /dev/null
+++ b/src/test/test-resource-failure1/service_config
@@ -0,0 +1,3 @@
+{
+ "fa:110": { "node": "node2" }
+}
diff --git a/src/test/test-resource-failure2/README b/src/test/test-resource-failure2/README
new file mode 100644
index 0000000..b9a0340
--- /dev/null
+++ b/src/test/test-resource-failure2/README
@@ -0,0 +1,4 @@
+Test restart in combination with relocate policy.
+Service 'fa:130' fails three times to restart and has a
+'max_restart' policy of 2. So after the second time it
+should be relocated to another node and start there successfully
diff --git a/src/test/test-resource-failure2/cmdlist b/src/test/test-resource-failure2/cmdlist
new file mode 100644
index 0000000..8f06508
--- /dev/null
+++ b/src/test/test-resource-failure2/cmdlist
@@ -0,0 +1,4 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service fa:130 enabled" ]
+]
diff --git a/src/test/test-resource-failure2/groups b/src/test/test-resource-failure2/groups
new file mode 100644
index 0000000..01d634f
--- /dev/null
+++ b/src/test/test-resource-failure2/groups
@@ -0,0 +1,2 @@
+group: all
+ nodes node1,node2,node3
diff --git a/src/test/test-resource-failure2/hardware_status b/src/test/test-resource-failure2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure2/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure2/log.expect b/src/test/test-resource-failure2/log.expect
new file mode 100644
index 0000000..3c827f8
--- /dev/null
+++ b/src/test/test-resource-failure2/log.expect
@@ -0,0 +1,44 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'fa:130' on node 'node2'
+info 20 node1/crm: service 'fa:130': state changed from 'started' to 'request_stop'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 40 node1/crm: service 'fa:130': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute service fa:130 enabled
+info 120 node1/crm: service 'fa:130': state changed from 'stopped' to 'started' (node = node2)
+info 123 node2/lrm: starting service fa:130
+warn 123 node2/lrm: unable to start service fa:130
+warn 123 node2/lrm: restart policy: retry number 1 for service 'fa:130'
+info 143 node2/lrm: starting service fa:130
+warn 143 node2/lrm: unable to start service fa:130
+warn 143 node2/lrm: restart policy: retry number 2 for service 'fa:130'
+info 163 node2/lrm: starting service fa:130
+warn 163 node2/lrm: unable to start service fa:130
+err 163 node2/lrm: unable to start service fa:130 on local node after 2 retries
+warn 180 node1/crm: starting service fa:130 on node 'node2' failed, relocating service.
+info 180 node1/crm: relocate service 'fa:130' to node 'node1'
+info 180 node1/crm: service 'fa:130': state changed from 'started' to 'relocate' (node = node2, target = node1)
+info 183 node2/lrm: service fa:130 - start relocate to node 'node1'
+info 183 node2/lrm: service fa:130 - end relocate to node 'node1'
+info 200 node1/crm: service 'fa:130': state changed from 'relocate' to 'started' (node = node1)
+info 201 node1/lrm: got lock 'ha_agent_node1_lock'
+info 201 node1/lrm: status change wait_for_agent_lock => active
+info 201 node1/lrm: starting service fa:130
+info 201 node1/lrm: service status fa:130 started
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-resource-failure2/manager_status b/src/test/test-resource-failure2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure2/service_config b/src/test/test-resource-failure2/service_config
new file mode 100644
index 0000000..a3f5459
--- /dev/null
+++ b/src/test/test-resource-failure2/service_config
@@ -0,0 +1,3 @@
+{
+ "fa:130": { "node": "node2", "max_restart": "2", "group" : "all" }
+}
diff --git a/src/test/test-resource-failure3/README b/src/test/test-resource-failure3/README
new file mode 100644
index 0000000..2cd4cdd
--- /dev/null
+++ b/src/test/test-resource-failure3/README
@@ -0,0 +1,3 @@
+Test the behaviour if a service fails to migrate, use Service 'fa:101'
+for that purpose.
+We expect that it gets marked as started again at the source node.
diff --git a/src/test/test-resource-failure3/cmdlist b/src/test/test-resource-failure3/cmdlist
new file mode 100644
index 0000000..586fe8f
--- /dev/null
+++ b/src/test/test-resource-failure3/cmdlist
@@ -0,0 +1,4 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service fa:101 migrate node3" ]
+]
diff --git a/src/test/test-resource-failure3/hardware_status b/src/test/test-resource-failure3/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure3/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure3/log.expect b/src/test/test-resource-failure3/log.expect
new file mode 100644
index 0000000..857c094
--- /dev/null
+++ b/src/test/test-resource-failure3/log.expect
@@ -0,0 +1,30 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'fa:101' on node 'node2'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 23 node2/lrm: starting service fa:101
+info 23 node2/lrm: service status fa:101 started
+info 24 node3/crm: status change wait_for_quorum => slave
+info 120 cmdlist: execute service fa:101 migrate node3
+info 120 node1/crm: got crm command: migrate fa:101 node3
+info 120 node1/crm: migrate service 'fa:101' to node 'node3'
+info 120 node1/crm: service 'fa:101': state changed from 'started' to 'migrate' (node = node2, target = node3)
+err 123 node2/lrm: service fa:101 not moved (migration error)
+err 140 node1/crm: service 'fa:101' - migration failed (exit code 1)
+info 140 node1/crm: service 'fa:101': state changed from 'migrate' to 'started' (node = node2)
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-resource-failure3/manager_status b/src/test/test-resource-failure3/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure3/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure3/service_config b/src/test/test-resource-failure3/service_config
new file mode 100644
index 0000000..d596b9c
--- /dev/null
+++ b/src/test/test-resource-failure3/service_config
@@ -0,0 +1,3 @@
+{
+ "fa:101": { "node": "node2", "group" : "all", "state" : "enabled" }
+}
diff --git a/src/test/test-resource-failure4/README b/src/test/test-resource-failure4/README
new file mode 100644
index 0000000..367640e
--- /dev/null
+++ b/src/test/test-resource-failure4/README
@@ -0,0 +1,3 @@
+Test the behaviour if a service fails to stop, we should get an error
+until the user kills the process or deletes the service from HA.
+For simplicity we do the latter here.
diff --git a/src/test/test-resource-failure4/cmdlist b/src/test/test-resource-failure4/cmdlist
new file mode 100644
index 0000000..d5639f3
--- /dev/null
+++ b/src/test/test-resource-failure4/cmdlist
@@ -0,0 +1,5 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service fa:1001 disabled" ],
+ [ "service fa:1001 delete" ]
+]
diff --git a/src/test/test-resource-failure4/hardware_status b/src/test/test-resource-failure4/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure4/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure4/log.expect b/src/test/test-resource-failure4/log.expect
new file mode 100644
index 0000000..3e8eef9
--- /dev/null
+++ b/src/test/test-resource-failure4/log.expect
@@ -0,0 +1,42 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'fa:1001' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service fa:1001
+info 25 node3/lrm: service status fa:1001 started
+info 120 cmdlist: execute service fa:1001 disabled
+info 120 node1/crm: service 'fa:1001': state changed from 'started' to 'request_stop'
+info 125 node3/lrm: stopping service fa:1001
+info 125 node3/lrm: unable to stop stop service fa:1001 (still running)
+err 140 node1/crm: service 'fa:1001' stop failed (exit code 1)
+info 140 node1/crm: service 'fa:1001': state changed from 'request_stop' to 'error'
+info 140 node1/crm: service 'fa:1001': state changed from 'error' to 'stopped'
+info 145 node3/lrm: stopping service fa:1001
+info 145 node3/lrm: unable to stop stop service fa:1001 (still running)
+info 165 node3/lrm: stopping service fa:1001
+info 165 node3/lrm: unable to stop stop service fa:1001 (still running)
+info 185 node3/lrm: stopping service fa:1001
+info 185 node3/lrm: unable to stop stop service fa:1001 (still running)
+info 205 node3/lrm: stopping service fa:1001
+info 205 node3/lrm: unable to stop stop service fa:1001 (still running)
+info 220 cmdlist: execute service fa:1001 delete
+info 220 node1/crm: removing stale service 'fa:1001' (no config)
+info 222 node2/crm: status change slave => wait_for_quorum
+info 224 node3/crm: status change slave => wait_for_quorum
+info 820 hardware: exit simulation - done
diff --git a/src/test/test-resource-failure4/manager_status b/src/test/test-resource-failure4/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure4/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure4/service_config b/src/test/test-resource-failure4/service_config
new file mode 100644
index 0000000..7dc11a3
--- /dev/null
+++ b/src/test/test-resource-failure4/service_config
@@ -0,0 +1,3 @@
+{
+ "fa:1001": { "node": "node3", "state" : "enabled" }
+}
diff --git a/src/test/test-resource-failure5/README b/src/test/test-resource-failure5/README
new file mode 100644
index 0000000..1ed06c8
--- /dev/null
+++ b/src/test/test-resource-failure5/README
@@ -0,0 +1,4 @@
+Test restart policy if service fails multiple times and may not relocate.
+The LRM should try to start it again but ultimately fail and place it
+in the error state.
+Then we execute a command to disable it.
diff --git a/src/test/test-resource-failure5/cmdlist b/src/test/test-resource-failure5/cmdlist
new file mode 100644
index 0000000..ff47c70
--- /dev/null
+++ b/src/test/test-resource-failure5/cmdlist
@@ -0,0 +1,5 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service fa:130 enabled" ],
+ [ "service fa:130 disabled" ]
+]
diff --git a/src/test/test-resource-failure5/hardware_status b/src/test/test-resource-failure5/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure5/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure5/log.expect b/src/test/test-resource-failure5/log.expect
new file mode 100644
index 0000000..f17421a
--- /dev/null
+++ b/src/test/test-resource-failure5/log.expect
@@ -0,0 +1,38 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'fa:130' on node 'node2'
+info 20 node1/crm: service 'fa:130': state changed from 'started' to 'request_stop'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 40 node1/crm: service 'fa:130': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute service fa:130 enabled
+info 120 node1/crm: service 'fa:130': state changed from 'stopped' to 'started' (node = node2)
+info 123 node2/lrm: starting service fa:130
+warn 123 node2/lrm: unable to start service fa:130
+warn 123 node2/lrm: restart policy: retry number 1 for service 'fa:130'
+info 143 node2/lrm: starting service fa:130
+warn 143 node2/lrm: unable to start service fa:130
+err 143 node2/lrm: unable to start service fa:130 on local node after 1 retries
+err 160 node1/crm: recovery policy for service fa:130 failed, entering error state!
+info 160 node1/crm: service 'fa:130': state changed from 'started' to 'error'
+warn 163 node2/lrm: service fa:130 is not running and in an error state
+warn 183 node2/lrm: service fa:130 is not running and in an error state
+warn 203 node2/lrm: service fa:130 is not running and in an error state
+info 220 cmdlist: execute service fa:130 disabled
+info 220 node1/crm: service 'fa:130': state changed from 'error' to 'stopped'
+info 820 hardware: exit simulation - done
diff --git a/src/test/test-resource-failure5/manager_status b/src/test/test-resource-failure5/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure5/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure5/service_config b/src/test/test-resource-failure5/service_config
new file mode 100644
index 0000000..bc033d1
--- /dev/null
+++ b/src/test/test-resource-failure5/service_config
@@ -0,0 +1,3 @@
+{
+ "fa:130": { "node": "node2", "max_restart" : "1", "max_relocate" : "0" }
+}
--
2.1.4
More information about the pve-devel
mailing list