[pve-devel] [PATCH ha-manager v3 3/6] remove state transition from error to fence state

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Sep 14 11:29:42 CEST 2016


Remove the possible transition from the error to the fence state. The
error state is an end state and must not be left through any automatic
action, only through manual intervention!

This also allows us later on to place a service which is not
recoverable from the fence state in the error state without generating
an endless loop of state changes.

Add a regression test for a failed node with a service in error
state.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Manager.pm                            |  5 -----
 src/test/test-erroneous-service1/README          |  2 ++
 src/test/test-erroneous-service1/cmdlist         |  4 ++++
 src/test/test-erroneous-service1/hardware_status |  5 +++++
 src/test/test-erroneous-service1/log.expect      | 18 ++++++++++++++++++
 src/test/test-erroneous-service1/manager_status  | 15 +++++++++++++++
 src/test/test-erroneous-service1/service_config  |  3 +++
 7 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100644 src/test/test-erroneous-service1/README
 create mode 100644 src/test/test-erroneous-service1/cmdlist
 create mode 100644 src/test/test-erroneous-service1/hardware_status
 create mode 100644 src/test/test-erroneous-service1/log.expect
 create mode 100644 src/test/test-erroneous-service1/manager_status
 create mode 100644 src/test/test-erroneous-service1/service_config

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index e3d6ffa..c60df7c 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -704,11 +704,6 @@ sub next_state_error {
 	return;
     }
 
-    if ($ns->node_is_offline_delayed($sd->{node})) {
-	&$change_service_state($self, $sid, 'fence');
-	return;
-    }
-
 }
 
 1;
diff --git a/src/test/test-erroneous-service1/README b/src/test/test-erroneous-service1/README
new file mode 100644
index 0000000..dccd247
--- /dev/null
+++ b/src/test/test-erroneous-service1/README
@@ -0,0 +1,2 @@
+Test failover after single node network failure and a service already in the
+error state. The service should not get touched as it needs manual intervention.
diff --git a/src/test/test-erroneous-service1/cmdlist b/src/test/test-erroneous-service1/cmdlist
new file mode 100644
index 0000000..c0a4daa
--- /dev/null
+++ b/src/test/test-erroneous-service1/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on" ],
+    [ "network node3 off" ]
+]
diff --git a/src/test/test-erroneous-service1/hardware_status b/src/test/test-erroneous-service1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-erroneous-service1/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-erroneous-service1/log.expect b/src/test/test-erroneous-service1/log.expect
new file mode 100644
index 0000000..bbb4cda
--- /dev/null
+++ b/src/test/test-erroneous-service1/log.expect
@@ -0,0 +1,18 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info    120      cmdlist: execute network node3 off
+info    120    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    124    node3/crm: status change slave => wait_for_quorum
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-erroneous-service1/manager_status b/src/test/test-erroneous-service1/manager_status
new file mode 100644
index 0000000..dc16f94
--- /dev/null
+++ b/src/test/test-erroneous-service1/manager_status
@@ -0,0 +1,15 @@
+{
+    "master_node": "node1",
+    "node_status": {
+        "node1": "online",
+        "node2": "online",
+        "node3": "online"
+    },
+    "service_status": {
+        "vm:103": {
+            "node": "node3",
+            "state": "error",
+            "uid": "kVYSNAeLNiBHm0ceiyNovg"
+        }
+    }
+}
diff --git a/src/test/test-erroneous-service1/service_config b/src/test/test-erroneous-service1/service_config
new file mode 100644
index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-erroneous-service1/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
-- 
2.1.4





More information about the pve-devel mailing list