[pve-devel] [PATCH ha-manager 9/9] manager: make service node usage computation more granular

Daniel Kral d.kral at proxmox.com
Tue Sep 30 16:19:19 CEST 2025


Currently, $online_node_usage is rebuilt on every call to manage(...),
but this can be reduced to rebuilding it only on a scheduler mode change
(including initialization and the error path, for completeness).

This allows recompute_online_node_usage(...) to be reduced to adding and
removing nodes whenever they come online or go offline, while service
usage updates are handled whenever the services themselves change.
Therefore, recompute_online_node_usage(...) must be called only once in
manage(...), after $ns has been properly updated.

Note that this makes the ha-manager not acknowledge any hotplug changes
to the guest configs anymore as long as the HA resource state doesn't
change.

Signed-off-by: Daniel Kral <d.kral at proxmox.com>
---
If we go for this patch, then we would need some mechanism to update the
static usage for a single HA resource or for all HA resources registered
in $online_node_usage at once (or just rebuild $online_node_usage at
that point).

 src/PVE/HA/Manager.pm | 90 +++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 253deba9..6fadb3f3 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -106,6 +106,7 @@ sub update_crs_scheduler_mode {
     if (!defined($old_mode)) {
         $haenv->log('info', "using scheduler mode '$new_mode'") if $new_mode ne 'basic';
     } elsif ($new_mode eq $old_mode) {
+        $haenv->update_static_service_stats() if $old_mode eq 'static';
         return; # nothing to do
     } else {
         $haenv->log('info', "switching scheduler mode from '$old_mode' to '$new_mode'");
@@ -113,6 +114,39 @@ sub update_crs_scheduler_mode {
 
     $self->{crs}->{scheduler} = $new_mode;
 
+    my $online_node_usage;
+
+    if ($new_mode eq 'static') {
+        $online_node_usage = eval {
+            my $scheduler = PVE::HA::Usage::Static->new($haenv);
+            $scheduler->add_node($_) for $self->{ns}->list_online_nodes()->@*;
+            $haenv->update_static_service_stats();
+            return $scheduler;
+        };
+        if ($@) {
+            $self->{crs}->{scheduler} = 'basic'; # retry on next update
+            $haenv->log(
+                'warning',
+                "fallback to 'basic' scheduler mode, init for 'static' failed - $@",
+            );
+        }
+    } elsif ($new_mode eq 'basic') {
+        # handled below in the general fall-back case
+    } else {
+        $haenv->log('warning', "got unknown scheduler mode '$new_mode', using 'basic'");
+    }
+
+    # fallback to the basic algorithm in any case
+    if (!$online_node_usage) {
+        $online_node_usage = PVE::HA::Usage::Basic->new($haenv);
+        $online_node_usage->add_node($_) for $self->{ns}->list_online_nodes()->@*;
+    }
+
+    $self->{online_node_usage} = $online_node_usage;
+
+    # initialize with current nodes and services states
+    $self->add_service_usage($_, $self->{ss}->{$_}) for keys $self->{ss}->%*;
+
     return;
 }
 
@@ -253,49 +287,19 @@ my $valid_service_states = {
 sub recompute_online_node_usage {
     my ($self) = @_;
 
-    my $haenv = $self->{haenv};
+    my ($haenv, $ns) = $self->@{qw(haenv ns)};
 
-    my $online_nodes = { map { $_ => 1 } $self->{ns}->list_online_nodes()->@* };
+    for my $node ($self->{online_node_usage}->list_nodes()) {
+        next if $ns->node_is_online($node);
 
-    my $online_node_usage;
-
-    if (my $mode = $self->{crs}->{scheduler}) {
-        if ($mode eq 'static') {
-            $online_node_usage = eval {
-                my $scheduler = PVE::HA::Usage::Static->new($haenv);
-                $scheduler->add_node($_) for keys $online_nodes->%*;
-                $haenv->update_static_service_stats();
-                return $scheduler;
-            };
-            $haenv->log(
-                'warning',
-                "fallback to 'basic' scheduler mode, init for 'static' failed - $@",
-            ) if $@;
-        } elsif ($mode eq 'basic') {
-            # handled below in the general fall-back case
-        } else {
-            $haenv->log('warning', "got unknown scheduler mode '$mode', using 'basic'");
-        }
+        $self->{online_node_usage}->remove_node($node);
     }
 
-    # fallback to the basic algorithm in any case
-    if (!$online_node_usage) {
-        $online_node_usage = PVE::HA::Usage::Basic->new($haenv);
-        $online_node_usage->add_node($_) for keys $online_nodes->%*;
+    for my $node ($ns->list_online_nodes()->@*) {
+        next if $self->{online_node_usage}->contains_node($node);
+
+        $self->{online_node_usage}->add_node($node);
     }
-
-    for my $sid (sort keys $self->{ss}->%*) {
-        my $sd = $self->{ss}->{$sid};
-        my $used_nodes = PVE::HA::Tools::get_used_service_nodes($sd, $online_nodes);
-        my ($current, $target) = $used_nodes->@{qw(current target)};
-
-        $online_node_usage->add_service_usage_to_node($current, $sid, $sd->{node}, $sd->{target})
-            if $current;
-        $online_node_usage->add_service_usage_to_node($target, $sid, $sd->{node}, $sd->{target})
-            if $target;
-    }
-
-    $self->{online_node_usage} = $online_node_usage;
 }
 
 my $change_service_state = sub {
@@ -693,6 +697,8 @@ sub manage {
 
     $self->{groups} = $haenv->read_group_config(); # update
 
+    $self->recompute_online_node_usage();
+
     # compute new service status
 
     # add new service
@@ -704,11 +710,13 @@ sub manage {
         $haenv->log('info', "adding new service '$sid' on node '$cd->{node}'");
         # assume we are running to avoid relocate running service at add
         my $state = ($cd->{state} eq 'started') ? 'request_start' : 'request_stop';
-        $ss->{$sid} = {
+        my $sd = $ss->{$sid} = {
             state => $state,
             node => $cd->{node},
             uid => compute_new_uuid('started'),
         };
+
+        $self->add_service_usage($sid, $sd);
     }
 
     # remove stale or ignored services from manager state
@@ -718,12 +726,12 @@ sub manage {
         my $reason = defined($sc->{$sid}) ? 'ignored state requested' : 'no config';
         $haenv->log('info', "removing stale service '$sid' ($reason)");
 
+        $self->{online_node_usage}->remove_service_usage($sid);
+
         # remove all service related state information
         delete $ss->{$sid};
     }
 
-    $self->recompute_online_node_usage();
-
     my $new_rules = $haenv->read_rules_config();
 
     # TODO PVE 10: Remove group migration when HA groups have been fully migrated to rules
-- 
2.47.3





More information about the pve-devel mailing list