[pve-devel] [PATCH ha-manager 3/3] add VirtFail resource and use it in new regression tests

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Feb 10 14:13:44 CET 2016


This resource lets us test a defined failure behaviour of services.

Through the VMID we define how it should behave, with the following
rules:

When the service has the SID "fa:abcde" the digits a - e mean:

a - no meaning but can be used for differentiating similar resources
b - how many start attempts fail before a start succeeds (0=default)
c - how many migrate attempts fail before a migration succeeds (0=default)
d - should shutdown be successful (0 = yes, anything else no)
e - return value of $plugin->exists() defaults to 1 if not set

a,b,c should always be set even if b and c have defaults (makes test
purpose clearer)
d and e are optional.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---

Check out the READMEs of the tests for a short description of what
they cover.

 src/PVE/HA/Sim/Env.pm                           |  2 +
 src/PVE/HA/Sim/Hardware.pm                      |  4 +-
 src/PVE/HA/Sim/Resources/VirtFail.pm            | 93 +++++++++++++++++++++++++
 src/test/test-resource-failure1/README          |  2 +
 src/test/test-resource-failure1/cmdlist         |  4 ++
 src/test/test-resource-failure1/hardware_status |  5 ++
 src/test/test-resource-failure1/log.expect      | 30 ++++++++
 src/test/test-resource-failure1/manager_status  |  1 +
 src/test/test-resource-failure1/service_config  |  3 +
 src/test/test-resource-failure2/README          |  4 ++
 src/test/test-resource-failure2/cmdlist         |  4 ++
 src/test/test-resource-failure2/groups          |  2 +
 src/test/test-resource-failure2/hardware_status |  5 ++
 src/test/test-resource-failure2/log.expect      | 44 ++++++++++++
 src/test/test-resource-failure2/manager_status  |  1 +
 src/test/test-resource-failure2/service_config  |  3 +
 src/test/test-resource-failure3/README          |  3 +
 src/test/test-resource-failure3/cmdlist         |  4 ++
 src/test/test-resource-failure3/hardware_status |  5 ++
 src/test/test-resource-failure3/log.expect      | 30 ++++++++
 src/test/test-resource-failure3/manager_status  |  1 +
 src/test/test-resource-failure3/service_config  |  3 +
 src/test/test-resource-failure4/README          |  3 +
 src/test/test-resource-failure4/cmdlist         |  5 ++
 src/test/test-resource-failure4/hardware_status |  5 ++
 src/test/test-resource-failure4/log.expect      | 42 +++++++++++
 src/test/test-resource-failure4/manager_status  |  1 +
 src/test/test-resource-failure4/service_config  |  3 +
 src/test/test-resource-failure5/README          |  4 ++
 src/test/test-resource-failure5/cmdlist         |  5 ++
 src/test/test-resource-failure5/hardware_status |  5 ++
 src/test/test-resource-failure5/log.expect      | 38 ++++++++++
 src/test/test-resource-failure5/manager_status  |  1 +
 src/test/test-resource-failure5/service_config  |  3 +
 34 files changed, 367 insertions(+), 1 deletion(-)
 create mode 100644 src/PVE/HA/Sim/Resources/VirtFail.pm
 create mode 100644 src/test/test-resource-failure1/README
 create mode 100644 src/test/test-resource-failure1/cmdlist
 create mode 100644 src/test/test-resource-failure1/hardware_status
 create mode 100644 src/test/test-resource-failure1/log.expect
 create mode 100644 src/test/test-resource-failure1/manager_status
 create mode 100644 src/test/test-resource-failure1/service_config
 create mode 100644 src/test/test-resource-failure2/README
 create mode 100644 src/test/test-resource-failure2/cmdlist
 create mode 100644 src/test/test-resource-failure2/groups
 create mode 100644 src/test/test-resource-failure2/hardware_status
 create mode 100644 src/test/test-resource-failure2/log.expect
 create mode 100644 src/test/test-resource-failure2/manager_status
 create mode 100644 src/test/test-resource-failure2/service_config
 create mode 100644 src/test/test-resource-failure3/README
 create mode 100644 src/test/test-resource-failure3/cmdlist
 create mode 100644 src/test/test-resource-failure3/hardware_status
 create mode 100644 src/test/test-resource-failure3/log.expect
 create mode 100644 src/test/test-resource-failure3/manager_status
 create mode 100644 src/test/test-resource-failure3/service_config
 create mode 100644 src/test/test-resource-failure4/README
 create mode 100644 src/test/test-resource-failure4/cmdlist
 create mode 100644 src/test/test-resource-failure4/hardware_status
 create mode 100644 src/test/test-resource-failure4/log.expect
 create mode 100644 src/test/test-resource-failure4/manager_status
 create mode 100644 src/test/test-resource-failure4/service_config
 create mode 100644 src/test/test-resource-failure5/README
 create mode 100644 src/test/test-resource-failure5/cmdlist
 create mode 100644 src/test/test-resource-failure5/hardware_status
 create mode 100644 src/test/test-resource-failure5/log.expect
 create mode 100644 src/test/test-resource-failure5/manager_status
 create mode 100644 src/test/test-resource-failure5/service_config

diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index e154988..1978ebc 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -13,9 +13,11 @@ use PVE::HA::Env;
 use PVE::HA::Resources;
 use PVE::HA::Sim::Resources::VirtVM;
 use PVE::HA::Sim::Resources::VirtCT;
+use PVE::HA::Sim::Resources::VirtFail;
 
 PVE::HA::Sim::Resources::VirtVM->register();
 PVE::HA::Sim::Resources::VirtCT->register();
+PVE::HA::Sim::Resources::VirtFail->register();
 
 PVE::HA::Resources->init();
 
diff --git a/src/PVE/HA/Sim/Hardware.pm b/src/PVE/HA/Sim/Hardware.pm
index 2cbe64d..652f11d 100644
--- a/src/PVE/HA/Sim/Hardware.pm
+++ b/src/PVE/HA/Sim/Hardware.pm
@@ -93,13 +93,15 @@ sub read_service_config {
 
 	die "service '$sid' without assigned node!" if !$d->{node};
 
-	if ($sid =~ m/^(vm|ct):(\d+)$/) {
+	if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
 	    $d->{type} = $1;
 	    $d->{name} = $2;
 	} else {
 	    die "implement me";
 	}
 	$d->{state} = 'disabled' if !$d->{state};
+	$d->{max_restart} = 1 if !defined($d->{max_restart});
+	$d->{max_relocate} = 1 if !defined($d->{max_relocate});
     }
 
     return $conf;
diff --git a/src/PVE/HA/Sim/Resources/VirtFail.pm b/src/PVE/HA/Sim/Resources/VirtFail.pm
new file mode 100644
index 0000000..e421f29
--- /dev/null
+++ b/src/PVE/HA/Sim/Resources/VirtFail.pm
@@ -0,0 +1,93 @@
+package PVE::HA::Sim::Resources::VirtFail;
+
+use strict;
+use warnings;
+
+use base qw(PVE::HA::Sim::Resources);
+
+# This class lets us simulate failing resources for the regression tests
+# To make it more interesting we can encode some behaviour in the VMID
+# with the following format, where fa: is the type and a, b, c, ...
+# are digits in base 10
+# fa:abcde
+# meaning:
+# a - no meaning but can be used for differentiating similar resources
+# b - how many start attempts fail before a start succeeds (0 is normal behaviour) (should be set)
+# c - how many migrate attempts fail before a migration succeeds (0 is normal behaviour) (should be set)
+# d - should shutdown be successful (0 = yes, anything else no) (optional)
+# e - return value of $plugin->exists() defaults to 1 if not set (optional)
+
+my $decode_id = sub {
+    my ($id) = @_;
+
+    # skip digit 'a'; 'b' and 'c' must be present, 'd' and 'e' are optional
+    my ($start, $migrate, $stop, $exists) = $id =~ /^\d(\d)(\d)(\d)?(\d)?/g;
+    # digits missing in the id fall back to the "normal behaviour" values
+    $start = 0 unless defined($start);
+    $migrate = 0 unless defined($migrate);
+    $stop = 0 unless defined($stop);
+    $exists = 1 unless defined($exists);
+    return ($start, $migrate, $stop, $exists)
+};
+
+my $tries = { # attempt counters per operation, each keyed by service id
+    start => {},
+    migrate => {},
+};
+
+
+sub type { # plugin type name; the 'fa' prefix of 'fa:abcde' service IDs
+    return 'fa';
+}
+
+sub exists {
+    my ($class, $id, $noerr) = @_;
+
+    # The simulated existence is digit 'e' of the id (1 if absent); $noerr is
+    # accepted but unused. No debug print here, it would clutter the test output.
+    my (undef, undef, undef, $exists) = &$decode_id($id);
+    return $exists;
+}
+
+sub start {
+    my ($class, $haenv, $id) = @_;
+
+    # digit 'b' of the id: number of attempts that have to fail first
+    my ($fail_count) = $decode_id->($id);
+
+    my $attempts = $tries->{start};
+    $attempts->{$id} = 0 if !$attempts->{$id};
+    my $attempt = ++$attempts->{$id};
+
+    # still inside the failing series => empty return, parent is not called
+    return if $attempt <= $fail_count;
+    $attempts->{$id} = 0; # the next start begins a fresh series
+    return $class->SUPER::start($haenv, $id);
+}
+
+sub shutdown {
+    my ($class, $haenv, $id) = @_;
+
+    # digit 'd' of the id: any true value makes the shutdown fail
+    my $stop_fails = ($decode_id->($id))[2];
+
+    return if $stop_fails;
+    return $class->SUPER::shutdown($haenv, $id);
+}
+
+sub migrate {
+    my ($class, $haenv, $id, $target, $online) = @_;
+
+    # digit 'c' of the id: number of attempts that have to fail first
+    my $fail_count = ($decode_id->($id))[1];
+
+    my $attempts = $tries->{migrate};
+    $attempts->{$id} = 0 if !$attempts->{$id};
+    my $attempt = ++$attempts->{$id};
+
+    # still inside the failing series => empty return, parent is not called
+    return if $attempt <= $fail_count;
+    $attempts->{$id} = 0; # the next migration begins a fresh series
+    return $class->SUPER::migrate($haenv, $id, $target, $online);
+}
+1;
diff --git a/src/test/test-resource-failure1/README b/src/test/test-resource-failure1/README
new file mode 100644
index 0000000..e1a6dc9
--- /dev/null
+++ b/src/test/test-resource-failure1/README
@@ -0,0 +1,2 @@
+Test restart policy if service fails one time.
+The LRM should try again to start it and succeed.
diff --git a/src/test/test-resource-failure1/cmdlist b/src/test/test-resource-failure1/cmdlist
new file mode 100644
index 0000000..5489743
--- /dev/null
+++ b/src/test/test-resource-failure1/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service fa:110 enabled" ]
+]
diff --git a/src/test/test-resource-failure1/hardware_status b/src/test/test-resource-failure1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure1/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure1/log.expect b/src/test/test-resource-failure1/log.expect
new file mode 100644
index 0000000..97d9fdb
--- /dev/null
+++ b/src/test/test-resource-failure1/log.expect
@@ -0,0 +1,30 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'fa:110' on node 'node2'
+info     20    node1/crm: service 'fa:110': state changed from 'started' to 'request_stop' 
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     24    node3/crm: status change wait_for_quorum => slave
+info     40    node1/crm: service 'fa:110': state changed from 'request_stop' to 'stopped' 
+info    120      cmdlist: execute service fa:110 enabled
+info    120    node1/crm: service 'fa:110': state changed from 'stopped' to 'started'  (node = node2)
+info    123    node2/lrm: starting service fa:110
+warn    123    node2/lrm: unable to start service fa:110
+warn    123    node2/lrm: restart policy: retry number 1 for service 'fa:110'
+info    143    node2/lrm: starting service fa:110
+info    143    node2/lrm: service status fa:110 started
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-resource-failure1/manager_status b/src/test/test-resource-failure1/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure1/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure1/service_config b/src/test/test-resource-failure1/service_config
new file mode 100644
index 0000000..cf17020
--- /dev/null
+++ b/src/test/test-resource-failure1/service_config
@@ -0,0 +1,3 @@
+{
+    "fa:110": { "node": "node2" }
+}
diff --git a/src/test/test-resource-failure2/README b/src/test/test-resource-failure2/README
new file mode 100644
index 0000000..b9a0340
--- /dev/null
+++ b/src/test/test-resource-failure2/README
@@ -0,0 +1,4 @@
+Test restart in combination with relocate policy.
+Service 'fa:130' fails three times to restart and has a
+'max_restart' policy of 2. So after the second time it
+should be relocated to another node and start there successfully.
diff --git a/src/test/test-resource-failure2/cmdlist b/src/test/test-resource-failure2/cmdlist
new file mode 100644
index 0000000..8f06508
--- /dev/null
+++ b/src/test/test-resource-failure2/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service fa:130 enabled" ]
+]
diff --git a/src/test/test-resource-failure2/groups b/src/test/test-resource-failure2/groups
new file mode 100644
index 0000000..01d634f
--- /dev/null
+++ b/src/test/test-resource-failure2/groups
@@ -0,0 +1,2 @@
+group: all
+	nodes node1,node2,node3
diff --git a/src/test/test-resource-failure2/hardware_status b/src/test/test-resource-failure2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure2/log.expect b/src/test/test-resource-failure2/log.expect
new file mode 100644
index 0000000..3c827f8
--- /dev/null
+++ b/src/test/test-resource-failure2/log.expect
@@ -0,0 +1,44 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'fa:130' on node 'node2'
+info     20    node1/crm: service 'fa:130': state changed from 'started' to 'request_stop' 
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     24    node3/crm: status change wait_for_quorum => slave
+info     40    node1/crm: service 'fa:130': state changed from 'request_stop' to 'stopped' 
+info    120      cmdlist: execute service fa:130 enabled
+info    120    node1/crm: service 'fa:130': state changed from 'stopped' to 'started'  (node = node2)
+info    123    node2/lrm: starting service fa:130
+warn    123    node2/lrm: unable to start service fa:130
+warn    123    node2/lrm: restart policy: retry number 1 for service 'fa:130'
+info    143    node2/lrm: starting service fa:130
+warn    143    node2/lrm: unable to start service fa:130
+warn    143    node2/lrm: restart policy: retry number 2 for service 'fa:130'
+info    163    node2/lrm: starting service fa:130
+warn    163    node2/lrm: unable to start service fa:130
+err     163    node2/lrm: unable to start service fa:130 on local node after 2 retries
+warn    180    node1/crm: starting service fa:130 on node 'node2' failed, relocating service.
+info    180    node1/crm: relocate service 'fa:130' to node 'node1'
+info    180    node1/crm: service 'fa:130': state changed from 'started' to 'relocate'  (node = node2, target = node1)
+info    183    node2/lrm: service fa:130 - start relocate to node 'node1'
+info    183    node2/lrm: service fa:130 - end relocate to node 'node1'
+info    200    node1/crm: service 'fa:130': state changed from 'relocate' to 'started'  (node = node1)
+info    201    node1/lrm: got lock 'ha_agent_node1_lock'
+info    201    node1/lrm: status change wait_for_agent_lock => active
+info    201    node1/lrm: starting service fa:130
+info    201    node1/lrm: service status fa:130 started
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-resource-failure2/manager_status b/src/test/test-resource-failure2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure2/service_config b/src/test/test-resource-failure2/service_config
new file mode 100644
index 0000000..a3f5459
--- /dev/null
+++ b/src/test/test-resource-failure2/service_config
@@ -0,0 +1,3 @@
+{
+    "fa:130": { "node": "node2", "max_restart": "2", "group" : "all" }
+}
diff --git a/src/test/test-resource-failure3/README b/src/test/test-resource-failure3/README
new file mode 100644
index 0000000..2cd4cdd
--- /dev/null
+++ b/src/test/test-resource-failure3/README
@@ -0,0 +1,3 @@
+Test the behaviour if a service fails to migrate, use Service 'fa:101'
+for that purpose.
+We expect that it gets marked as started again at the source node.
diff --git a/src/test/test-resource-failure3/cmdlist b/src/test/test-resource-failure3/cmdlist
new file mode 100644
index 0000000..586fe8f
--- /dev/null
+++ b/src/test/test-resource-failure3/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service fa:101 migrate node3" ]
+]
diff --git a/src/test/test-resource-failure3/hardware_status b/src/test/test-resource-failure3/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure3/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure3/log.expect b/src/test/test-resource-failure3/log.expect
new file mode 100644
index 0000000..857c094
--- /dev/null
+++ b/src/test/test-resource-failure3/log.expect
@@ -0,0 +1,30 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'fa:101' on node 'node2'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     23    node2/lrm: starting service fa:101
+info     23    node2/lrm: service status fa:101 started
+info     24    node3/crm: status change wait_for_quorum => slave
+info    120      cmdlist: execute service fa:101 migrate node3
+info    120    node1/crm: got crm command: migrate fa:101 node3
+info    120    node1/crm: migrate service 'fa:101' to node 'node3'
+info    120    node1/crm: service 'fa:101': state changed from 'started' to 'migrate'  (node = node2, target = node3)
+err     123    node2/lrm: service fa:101 not moved (migration error)
+err     140    node1/crm: service 'fa:101' - migration failed (exit code 1)
+info    140    node1/crm: service 'fa:101': state changed from 'migrate' to 'started'  (node = node2)
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-resource-failure3/manager_status b/src/test/test-resource-failure3/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure3/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure3/service_config b/src/test/test-resource-failure3/service_config
new file mode 100644
index 0000000..d596b9c
--- /dev/null
+++ b/src/test/test-resource-failure3/service_config
@@ -0,0 +1,3 @@
+{
+    "fa:101": { "node": "node2", "group" : "all", "state" : "enabled" }
+}
diff --git a/src/test/test-resource-failure4/README b/src/test/test-resource-failure4/README
new file mode 100644
index 0000000..367640e
--- /dev/null
+++ b/src/test/test-resource-failure4/README
@@ -0,0 +1,3 @@
+Test the behaviour if a service fails to stop, we should get an error
+until the user kills the process or deletes the service from HA.
+For simplicity we do the latter here.
diff --git a/src/test/test-resource-failure4/cmdlist b/src/test/test-resource-failure4/cmdlist
new file mode 100644
index 0000000..d5639f3
--- /dev/null
+++ b/src/test/test-resource-failure4/cmdlist
@@ -0,0 +1,5 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service fa:1001 disabled" ],
+    [ "service fa:1001 delete" ]
+]
diff --git a/src/test/test-resource-failure4/hardware_status b/src/test/test-resource-failure4/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure4/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure4/log.expect b/src/test/test-resource-failure4/log.expect
new file mode 100644
index 0000000..3e8eef9
--- /dev/null
+++ b/src/test/test-resource-failure4/log.expect
@@ -0,0 +1,42 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'fa:1001' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service fa:1001
+info     25    node3/lrm: service status fa:1001 started
+info    120      cmdlist: execute service fa:1001 disabled
+info    120    node1/crm: service 'fa:1001': state changed from 'started' to 'request_stop' 
+info    125    node3/lrm: stopping service fa:1001
+info    125    node3/lrm: unable to stop stop service fa:1001 (still running)
+err     140    node1/crm: service 'fa:1001' stop failed (exit code 1)
+info    140    node1/crm: service 'fa:1001': state changed from 'request_stop' to 'error' 
+info    140    node1/crm: service 'fa:1001': state changed from 'error' to 'stopped' 
+info    145    node3/lrm: stopping service fa:1001
+info    145    node3/lrm: unable to stop stop service fa:1001 (still running)
+info    165    node3/lrm: stopping service fa:1001
+info    165    node3/lrm: unable to stop stop service fa:1001 (still running)
+info    185    node3/lrm: stopping service fa:1001
+info    185    node3/lrm: unable to stop stop service fa:1001 (still running)
+info    205    node3/lrm: stopping service fa:1001
+info    205    node3/lrm: unable to stop stop service fa:1001 (still running)
+info    220      cmdlist: execute service fa:1001 delete
+info    220    node1/crm: removing stale service 'fa:1001' (no config)
+info    222    node2/crm: status change slave => wait_for_quorum
+info    224    node3/crm: status change slave => wait_for_quorum
+info    820     hardware: exit simulation - done
diff --git a/src/test/test-resource-failure4/manager_status b/src/test/test-resource-failure4/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure4/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure4/service_config b/src/test/test-resource-failure4/service_config
new file mode 100644
index 0000000..7dc11a3
--- /dev/null
+++ b/src/test/test-resource-failure4/service_config
@@ -0,0 +1,3 @@
+{
+    "fa:1001": { "node": "node3", "state" : "enabled" }
+}
diff --git a/src/test/test-resource-failure5/README b/src/test/test-resource-failure5/README
new file mode 100644
index 0000000..1ed06c8
--- /dev/null
+++ b/src/test/test-resource-failure5/README
@@ -0,0 +1,4 @@
+Test restart policy if service fails multiple times and may not relocate.
+The LRM should try to start it again but ultimately fail and place it
+in the error state.
+Then we execute a command to disable it.
diff --git a/src/test/test-resource-failure5/cmdlist b/src/test/test-resource-failure5/cmdlist
new file mode 100644
index 0000000..ff47c70
--- /dev/null
+++ b/src/test/test-resource-failure5/cmdlist
@@ -0,0 +1,5 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service fa:130 enabled" ],
+    [ "service fa:130 disabled" ]
+]
diff --git a/src/test/test-resource-failure5/hardware_status b/src/test/test-resource-failure5/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-resource-failure5/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-resource-failure5/log.expect b/src/test/test-resource-failure5/log.expect
new file mode 100644
index 0000000..f17421a
--- /dev/null
+++ b/src/test/test-resource-failure5/log.expect
@@ -0,0 +1,38 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'fa:130' on node 'node2'
+info     20    node1/crm: service 'fa:130': state changed from 'started' to 'request_stop' 
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     24    node3/crm: status change wait_for_quorum => slave
+info     40    node1/crm: service 'fa:130': state changed from 'request_stop' to 'stopped' 
+info    120      cmdlist: execute service fa:130 enabled
+info    120    node1/crm: service 'fa:130': state changed from 'stopped' to 'started'  (node = node2)
+info    123    node2/lrm: starting service fa:130
+warn    123    node2/lrm: unable to start service fa:130
+warn    123    node2/lrm: restart policy: retry number 1 for service 'fa:130'
+info    143    node2/lrm: starting service fa:130
+warn    143    node2/lrm: unable to start service fa:130
+err     143    node2/lrm: unable to start service fa:130 on local node after 1 retries
+err     160    node1/crm: recovery policy for service fa:130 failed, entering error state!
+info    160    node1/crm: service 'fa:130': state changed from 'started' to 'error' 
+warn    163    node2/lrm: service fa:130 is not running and in an error state
+warn    183    node2/lrm: service fa:130 is not running and in an error state
+warn    203    node2/lrm: service fa:130 is not running and in an error state
+info    220      cmdlist: execute service fa:130 disabled
+info    220    node1/crm: service 'fa:130': state changed from 'error' to 'stopped' 
+info    820     hardware: exit simulation - done
diff --git a/src/test/test-resource-failure5/manager_status b/src/test/test-resource-failure5/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-resource-failure5/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-resource-failure5/service_config b/src/test/test-resource-failure5/service_config
new file mode 100644
index 0000000..bc033d1
--- /dev/null
+++ b/src/test/test-resource-failure5/service_config
@@ -0,0 +1,3 @@
+{
+    "fa:130": { "node": "node2", "max_restart" : "1", "max_relocate" : "0" }
+}
-- 
2.1.4





More information about the pve-devel mailing list