[pve-devel] applied: [PATCH ha-manager 2/3] do simple fallback if node comes back online from maintenance
Thomas Lamprecht
t.lamprecht at proxmox.com
Mon Nov 25 19:49:12 CET 2019
We simply remember the node we were on when moved away for maintenance.
This record gets dropped once we move to _any_ other node, be it:
* our previous node, as it came back from maintenance
* another node due to manual migration, group priority changes or
fencing
The first point is handled explicitly by this patch. In select_service_node
we check for an old fallback node; if it is found in the online node list
with top priority we _always_ move back to it, even if there's no other
reason for a move.
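
Conceptually, the drop rules above boil down to the following minimal
standalone sketch (apply_maintenance_fallback_rules() is a hypothetical
helper for illustration, not the actual Manager.pm code; $sd mimics the
per-service state hash with its 'node' and 'maintenance_node' keys):

#!/usr/bin/perl
use strict;
use warnings;

# decide whether the remembered maintenance fallback node is still relevant
# once a (new) target node has been selected for the service
sub apply_maintenance_fallback_rules {
    my ($sd, $selected_node) = @_;

    my $fallback = $sd->{maintenance_node};
    return if !defined($fallback);

    if ($selected_node eq $fallback) {
        # previous node came back from maintenance -> move back, forget record
        delete $sd->{maintenance_node};
    } elsif ($sd->{node} ne $fallback) {
        # we ended up on some other node (manual migration, group priority
        # change, fencing) -> the record is stale, drop it
        delete $sd->{maintenance_node};
    }
    # else: still parked on the temporary node, keep the record around
}

# toy usage: service sits on node2, node3 is its maintenance fallback
my $sd = { node => 'node2', maintenance_node => 'node3' };
apply_maintenance_fallback_rules($sd, 'node3');
print defined($sd->{maintenance_node}) ? "kept\n" : "dropped\n"; # prints "dropped"
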
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Manager.pm | 39 +++++++++++++++++--
.../log.expect | 16 ++++++++
src/test/test-shutdown-policy3/log.expect | 20 ++++++++++
3 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 1f14754..9e46f19 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -93,7 +93,7 @@ sub get_node_priority_groups {
}
sub select_service_node {
- my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes) = @_;
+ my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
my $group = get_service_group($groups, $online_node_usage, $service_conf);
@@ -123,12 +123,19 @@ sub select_service_node {
} keys %{$pri_groups->{$top_pri}};
my $found;
+ my $found_maintenace_fallback;
for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
my $node = $nodes[$i];
if ($node eq $current_node) {
$found = $i;
- last;
}
+ if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
+ $found_maintenace_fallback = $i;
+ }
+ }
+
+ if (defined($found_maintenace_fallback)) {
+ return $nodes[$found_maintenace_fallback];
}
if ($try_next) {
@@ -207,6 +214,7 @@ my $change_service_state = sub {
my $old_state = $sd->{state};
my $old_node = $sd->{node};
my $old_failed_nodes = $sd->{failed_nodes};
+ my $old_maintenance_node = $sd->{maintenance_node};
die "no state change" if $old_state eq $new_state; # just to be sure
@@ -217,6 +225,7 @@ my $change_service_state = sub {
$sd->{state} = $new_state;
$sd->{node} = $old_node;
$sd->{failed_nodes} = $old_failed_nodes if defined($old_failed_nodes);
+ $sd->{maintenance_node} = $old_maintenance_node if defined($old_maintenance_node);
my $text_state = '';
foreach my $k (sort keys %params) {
@@ -641,6 +650,10 @@ sub next_state_started {
}
if ($ns->get_node_state($sd->{node}) ne 'maintenance') {
return;
+ } else {
+ # save current node as fallback for when it comes out of
+ # maintenance
+ $sd->{maintenance_node} = $sd->{node};
}
}
@@ -733,11 +746,29 @@ sub next_state_started {
}
}
- my $node = select_service_node($self->{groups}, $self->{online_node_usage},
- $cd, $sd->{node}, $try_next, $sd->{failed_nodes});
+ my $node = select_service_node(
+ $self->{groups},
+ $self->{online_node_usage},
+ $cd,
+ $sd->{node},
+ $try_next,
+ $sd->{failed_nodes},
+ $sd->{maintenance_node},
+ );
if ($node && ($sd->{node} ne $node)) {
$self->{online_node_usage}->{$node}++;
+
+ if (defined(my $fallback = $sd->{maintenance_node})) {
+ if ($node eq $fallback) {
+ $haenv->log('info', "moving service '$sid' back to '$fallback', node came back from maintenance.");
+ delete $sd->{maintenance_node};
+ } elsif ($sd->{node} ne $fallback) {
+ $haenv->log('info', "dropping maintenance fallback node '$fallback' for '$sid'");
+ delete $sd->{maintenance_node};
+ }
+ }
+
if ($cd->{type} eq 'vm') {
$haenv->log('info', "migrate service '$sid' to node '$node' (running)");
&$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
diff --git a/src/test/test-shutdown-policy-migrate-fail1/log.expect b/src/test/test-shutdown-policy-migrate-fail1/log.expect
index 79664c7..1bb2291 100644
--- a/src/test/test-shutdown-policy-migrate-fail1/log.expect
+++ b/src/test/test-shutdown-policy-migrate-fail1/log.expect
@@ -102,5 +102,21 @@ info 345 reboot: execute power node3 on
info 345 node3/crm: status change startup => wait_for_quorum
info 340 node3/lrm: status change startup => wait_for_agent_lock
info 360 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 360 node1/crm: moving service 'fa:109' back to 'node3', node came back from maintenance.
+info 360 node1/crm: relocate service 'fa:109' to node 'node3'
+info 360 node1/crm: service 'fa:109': state changed from 'started' to 'relocate' (node = node2, target = node3)
+info 360 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 360 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 360 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 361 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 361 node1/lrm: service vm:103 - end migrate to node 'node3'
+err 363 node2/lrm: service fa:109 not moved (migration error)
info 364 node3/crm: status change wait_for_quorum => slave
+err 380 node1/crm: service 'fa:109' - migration failed (exit code 1)
+info 380 node1/crm: service 'fa:109': state changed from 'relocate' to 'started' (node = node2)
+info 380 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 385 node3/lrm: got lock 'ha_agent_node3_lock'
+info 385 node3/lrm: status change wait_for_agent_lock => active
+info 385 node3/lrm: starting service vm:103
+info 385 node3/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown-policy3/log.expect b/src/test/test-shutdown-policy3/log.expect
index 6ecf211..921c9f3 100644
--- a/src/test/test-shutdown-policy3/log.expect
+++ b/src/test/test-shutdown-policy3/log.expect
@@ -55,5 +55,25 @@ info 165 reboot: execute power node3 on
info 165 node3/crm: status change startup => wait_for_quorum
info 160 node3/lrm: status change startup => wait_for_agent_lock
info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 180 node1/crm: moving service 'ct:102' back to 'node3', node came back from maintenance.
+info 180 node1/crm: relocate service 'ct:102' to node 'node3'
+info 180 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node1, target = node3)
+info 180 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 180 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 180 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 181 node1/lrm: service ct:102 - start relocate to node 'node3'
+info 181 node1/lrm: stopping service ct:102 (relocate)
+info 181 node1/lrm: service status ct:102 stopped
+info 181 node1/lrm: service ct:102 - end relocate to node 'node3'
+info 181 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 181 node1/lrm: service vm:103 - end migrate to node 'node3'
info 184 node3/crm: status change wait_for_quorum => slave
+info 200 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node3)
+info 200 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 205 node3/lrm: got lock 'ha_agent_node3_lock'
+info 205 node3/lrm: status change wait_for_agent_lock => active
+info 205 node3/lrm: starting service ct:102
+info 205 node3/lrm: service status ct:102 started
+info 205 node3/lrm: starting service vm:103
+info 205 node3/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
--
2.20.1