[pve-devel] [RFC ha-manager] add migrate shutdown policy with a maintenance mode POC
Thomas Lamprecht
t.lamprecht at proxmox.com
Fri Oct 4 19:42:53 CEST 2019
This adds handling for a new shutdown policy, namely "migrate".
If that is set, the LRM doesn't queue stop jobs but transitions to a
new mode, namely 'maintenance'.
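For reference, on a real cluster this would be enabled through
/etc/pve/datacenter.cfg, roughly like the following (sketch only,
assuming the trivial schema change mentioned at the end of this
message):

    # /etc/pve/datacenter.cfg
    ha: shutdown_policy=migrate

The simulator reads the JSON variant instead, see the datacenter.cfg
test fixture below.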
The LRM modes now get passed from the CRM to the NodeStatus update
method, which allows detecting such a mode and making node-status
state transitions. Effectively, we only allow the transition if the
node is currently online; otherwise it is ignored. Note that
'maintenance' does not protect from fencing.
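Condensed, the new node-status transitions amount to the following
(this just restates the NodeStatus.pm hunk below; $lrm_mode is the
reported LRM mode of the respective node):

    if ($state eq 'online' && $lrm_mode eq 'maintenance') {
        $set_node_state->($self, $node, 'maintenance'); # enter maintenance
    } elsif ($state eq 'maintenance' && $lrm_mode ne 'maintenance') {
        $set_node_state->($self, $node, 'online'); # LRM finished, back online
    }
    # a node in 'maintenance' which drops out of the quorate partition
    # still degrades to 'unknown' and can be fenced from there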
The actual moving is then done by select_service_node. A node in
maintenance mode is not included in "list_online_nodes", and thus also
not in the online_node_usage hash used to re-calculate whether a
service needs to be moved. Only started services get moved; this can
mostly be done by leveraging existing behavior, the next_state_started
FSM state transition method just needs to be taught not to return
early for nodes which are not online but in maintenance mode.
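Condensed, the next_state_started change means: instead of always
returning early for a not-online node, we now fall through for
maintenance nodes so that select_service_node picks a new target
(again just restating the Manager.pm hunk below):

    if (!$ns->node_is_online($sd->{node})) {
        if ($ns->node_is_offline_delayed($sd->{node})) {
            &$change_service_state($self, $sid, 'fence');
        } elsif ($ns->get_node_state($sd->{node}) ne 'maintenance') {
            return; # node really gone, wait
        }
        # else: fall through, the maintenance node is not part of
        # online_node_usage, so the service gets migrated/relocated
    }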
A small test, adapted from the other shutdown policy tests, is added
to showcase the behavior.
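The test should be runnable through the usual regression test runner
in src/test, something like (invocation from memory, may differ):

    # in src/test of pve-ha-manager
    ./ha-tester.pl test-shutdown-policy3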
Note that this was *not* tested outside the simulation/test framework
and may have design and/or implementation issues there.
But it seems simple enough, and could be fleshed out further to see if
all edge cases hold up (at least as well as they do in other HA parts).
For real-world use, the datacenter.cfg schema needs to be changed to
allow the migrate shutdown policy, but that's trivial.
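For illustration, that schema change would boil down to extending the
shutdown_policy enum in the 'ha' property format, roughly (sketch
only, not part of this patch):

    shutdown_policy => {
        type => 'string',
        enum => ['freeze', 'failover', 'conditional', 'migrate'],
        description => "...",
        default => 'conditional',
        optional => 1,
    },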
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/LRM.pm | 36 +++++++++---
src/PVE/HA/Manager.pm | 10 ++--
src/PVE/HA/NodeStatus.pm | 14 ++++-
src/test/test-shutdown-policy3/cmdlist | 4 ++
src/test/test-shutdown-policy3/datacenter.cfg | 5 ++
.../test-shutdown-policy3/hardware_status | 5 ++
src/test/test-shutdown-policy3/log.expect | 58 +++++++++++++++++++
src/test/test-shutdown-policy3/manager_status | 1 +
src/test/test-shutdown-policy3/service_config | 4 ++
9 files changed, 123 insertions(+), 14 deletions(-)
create mode 100644 src/test/test-shutdown-policy3/cmdlist
create mode 100644 src/test/test-shutdown-policy3/datacenter.cfg
create mode 100644 src/test/test-shutdown-policy3/hardware_status
create mode 100644 src/test/test-shutdown-policy3/log.expect
create mode 100644 src/test/test-shutdown-policy3/manager_status
create mode 100644 src/test/test-shutdown-policy3/service_config
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index 3b4a572..83028c4 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -16,6 +16,7 @@ use PVE::HA::Resources;
my $valid_states = {
wait_for_agent_lock => "waiting for agent lock",
active => "got agent_lock",
+ maintenance => "going into maintenance",
lost_agent_lock => "lost agent_lock",
};
@@ -61,18 +62,21 @@ sub shutdown_request {
}
my $freeze_all;
+ my $maintenance;
if ($shutdown_policy eq 'conditional') {
$freeze_all = $reboot;
} elsif ($shutdown_policy eq 'freeze') {
$freeze_all = 1;
} elsif ($shutdown_policy eq 'failover') {
$freeze_all = 0;
+ } elsif ($shutdown_policy eq 'migrate') {
+ $maintenance = 1;
} else {
$haenv->log('err', "unknown shutdown policy '$shutdown_policy', fall back to conditional");
$freeze_all = $reboot;
}
- if ($shutdown) {
+ if ($shutdown && !$maintenance) {
# *always* queue stop jobs for all services if the node shuts down,
# independent if it's a reboot or a poweroff, else we may corrupt
# services or hinder node shutdown
@@ -88,12 +92,12 @@ sub shutdown_request {
}
if ($shutdown) {
- if ($freeze_all) {
- if ($reboot) {
- $haenv->log('info', "reboot LRM, stop and freeze all services");
- } else {
- $haenv->log('info', "shutdown LRM, stop and freeze all services");
- }
+ my $shutdown_type = $reboot ? 'reboot' : 'shutdown';
+ if ($maintenance) {
+ $haenv->log('info', "$shutdown_type LRM, doing maintenance, removing this node from active list");
+ $self->{mode} = 'maintenance';
+ } elsif ($freeze_all) {
+ $haenv->log('info', "$shutdown_type LRM, stop and freeze all services");
$self->{mode} = 'restart';
} else {
$haenv->log('info', "shutdown LRM, stop all services");
@@ -106,7 +110,7 @@ sub shutdown_request {
$self->{shutdown_request} = 1;
- eval { $self->update_lrm_status(); };
+ eval { $self->update_lrm_status() or die "not quorate?\n"; };
if (my $err = $@) {
$self->log('err', "unable to update lrm status file - $err");
}
@@ -355,6 +359,22 @@ sub work {
$haenv->release_ha_agent_lock();
}
}
+ } elsif ($self->{mode} eq 'maintenance') {
+ # wait until all active services moved away
+ my $service_count = $self->active_service_count();
+ if ($service_count == 0 && $self->run_workers() == 0) {
+ if ($self->{ha_agent_wd}) {
+ $haenv->watchdog_close($self->{ha_agent_wd});
+ delete $self->{ha_agent_wd};
+ }
+
+ $shutdown = 1;
+
+ # restart with no or frozen services, release the lock
+ $haenv->release_ha_agent_lock();
+ } else {
+ $self->manage_resources();
+ }
} else {
if ($self->run_workers() == 0) {
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 5137de8..b762d63 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -361,15 +361,16 @@ sub manage {
my ($haenv, $ms, $ns, $ss) = ($self->{haenv}, $self->{ms}, $self->{ns}, $self->{ss});
- $ns->update($haenv->get_node_info());
+ my ($node_info) = $haenv->get_node_info();
+ my ($lrm_results, $lrm_modes) = $self->read_lrm_status();
+
+ $ns->update($node_info, $lrm_modes);
if (!$ns->node_is_online($haenv->nodename())) {
$haenv->log('info', "master seems offline");
return;
}
- my ($lrm_results, $lrm_modes) = $self->read_lrm_status();
-
my $sc = $haenv->read_service_config();
$self->{groups} = $haenv->read_group_config(); # update
@@ -626,8 +627,9 @@ sub next_state_started {
if (!$ns->node_is_online($sd->{node})) {
if ($ns->node_is_offline_delayed($sd->{node})) {
&$change_service_state($self, $sid, 'fence');
+ } elsif ($ns->get_node_state($sd->{node}) ne 'maintenance') {
+ return;
}
- return;
}
if ($cd->{state} eq 'disabled' || $cd->{state} eq 'stopped') {
diff --git a/src/PVE/HA/NodeStatus.pm b/src/PVE/HA/NodeStatus.pm
index 8784110..22e6ab6 100644
--- a/src/PVE/HA/NodeStatus.pm
+++ b/src/PVE/HA/NodeStatus.pm
@@ -24,6 +24,7 @@ sub new {
# possible node state:
my $valid_node_states = {
online => "node online and member of quorate partition",
+ maintenance => "node is a member of quorate partition but currently not able to do work",
unknown => "not member of quorate partition, but possibly still running",
fence => "node needs to be fenced",
gone => "node vanished from cluster members list, possibly deleted"
@@ -117,12 +118,13 @@ my $set_node_state = sub {
};
sub update {
- my ($self, $node_info) = @_;
+ my ($self, $node_info, $lrm_modes) = @_;
my $haenv = $self->{haenv};
foreach my $node (sort keys %$node_info) {
my $d = $node_info->{$node};
+ my $lrm_mode = $lrm_modes->{$node} // 'unknown';
next if !$d->{online};
# record last time the node was online (required to implement fence delay)
@@ -131,11 +133,19 @@ sub update {
my $state = $self->get_node_state($node);
if ($state eq 'online') {
+ if ($lrm_mode eq 'maintenance') {
+ #$haenv->log('info', "update node state maintenance");
+ $set_node_state->($self, $node, 'maintenance');
+ }
# &$set_node_state($self, $node, 'online');
} elsif ($state eq 'unknown' || $state eq 'gone') {
&$set_node_state($self, $node, 'online');
} elsif ($state eq 'fence') {
# do nothing, wait until fenced
+ } elsif ($state eq 'maintenance') {
+ if ($lrm_mode ne 'maintenance') {
+ $set_node_state->($self, $node, 'online');
+ }
} else {
die "detected unknown node state '$state";
}
@@ -149,7 +159,7 @@ sub update {
# node is not inside quorate partition, possibly not active
- if ($state eq 'online') {
+ if ($state eq 'online' || $state eq 'maintenance') {
&$set_node_state($self, $node, 'unknown');
} elsif ($state eq 'unknown') {
diff --git a/src/test/test-shutdown-policy3/cmdlist b/src/test/test-shutdown-policy3/cmdlist
new file mode 100644
index 0000000..8558351
--- /dev/null
+++ b/src/test/test-shutdown-policy3/cmdlist
@@ -0,0 +1,4 @@
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "reboot node3" ]
+]
diff --git a/src/test/test-shutdown-policy3/datacenter.cfg b/src/test/test-shutdown-policy3/datacenter.cfg
new file mode 100644
index 0000000..de0bf81
--- /dev/null
+++ b/src/test/test-shutdown-policy3/datacenter.cfg
@@ -0,0 +1,5 @@
+{
+ "ha": {
+ "shutdown_policy": "migrate"
+ }
+}
diff --git a/src/test/test-shutdown-policy3/hardware_status b/src/test/test-shutdown-policy3/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-shutdown-policy3/hardware_status
@@ -0,0 +1,5 @@
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-shutdown-policy3/log.expect b/src/test/test-shutdown-policy3/log.expect
new file mode 100644
index 0000000..9f9879e
--- /dev/null
+++ b/src/test/test-shutdown-policy3/log.expect
@@ -0,0 +1,58 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'ct:102' on node 'node3'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service ct:102
+info 25 node3/lrm: service status ct:102 started
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute reboot node3
+info 120 node3/lrm: got shutdown request with shutdown policy 'migrate'
+info 120 node3/lrm: reboot LRM, doing maintenance, removing this node from active list
+info 120 node1/crm: node 'node3': state changed from 'online' => 'maintenance'
+info 120 node1/crm: relocate service 'ct:102' to node 'node1'
+info 120 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node3, target = node1)
+info 120 node1/crm: migrate service 'vm:103' to node 'node1' (running)
+info 120 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node3, target = node1)
+info 125 node3/lrm: service ct:102 - start relocate to node 'node1'
+info 125 node3/lrm: stopping service ct:102 (relocate)
+info 125 node3/lrm: service status ct:102 stopped
+info 125 node3/lrm: service ct:102 - end relocate to node 'node1'
+info 125 node3/lrm: service vm:103 - start migrate to node 'node1'
+info 125 node3/lrm: service vm:103 - end migrate to node 'node1'
+info 140 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node1)
+info 140 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node1)
+info 141 node1/lrm: got lock 'ha_agent_node1_lock'
+info 141 node1/lrm: status change wait_for_agent_lock => active
+info 141 node1/lrm: starting service ct:102
+info 141 node1/lrm: service status ct:102 started
+info 141 node1/lrm: starting service vm:103
+info 141 node1/lrm: service status vm:103 started
+info 146 node3/lrm: exit (loop end)
+info 146 reboot: execute crm node3 stop
+info 145 node3/crm: server received shutdown request
+info 165 node3/crm: exit (loop end)
+info 165 reboot: execute power node3 off
+info 165 reboot: execute power node3 on
+info 165 node3/crm: status change startup => wait_for_quorum
+info 160 node3/lrm: status change startup => wait_for_agent_lock
+info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 184 node3/crm: status change wait_for_quorum => slave
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown-policy3/manager_status b/src/test/test-shutdown-policy3/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-shutdown-policy3/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-shutdown-policy3/service_config b/src/test/test-shutdown-policy3/service_config
new file mode 100644
index 0000000..8ee94b5
--- /dev/null
+++ b/src/test/test-shutdown-policy3/service_config
@@ -0,0 +1,4 @@
+{
+ "vm:103": { "node": "node3", "state": "enabled" },
+ "ct:102": { "node": "node3", "state": "enabled" }
+}
--
2.20.1