[pve-devel] [PATCH pve-ha-manager 1/3] add resource awareness manager
Alexandre Derumier
aderumier at odiso.com
Mon Dec 13 08:43:14 CET 2021
---
src/PVE/HA/Env.pm | 24 ++++
src/PVE/HA/Env/PVE2.pm | 90 ++++++++++++++
src/PVE/HA/Manager.pm | 246 ++++++++++++++++++++++++++++++++++++--
src/PVE/HA/Sim/TestEnv.pm | 27 +++++
4 files changed, 380 insertions(+), 7 deletions(-)
diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index ac569a9..73b6407 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -269,4 +269,28 @@ sub get_ha_settings {
return $self->{plug}->get_ha_settings();
}
+# Return the RRD statistics of $node.
+#
+# Pure delegation: the call is forwarded to the environment plugin stored in
+# $self->{plug} and its result is handed back unchanged.
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    my $plug = $self->{plug};
+
+    return $plug->get_node_rrd_stats($node);
+}
+
+# Return the RRD statistics of guest $vmid for the requested $percentile.
+#
+# Pure delegation: both arguments are forwarded to the environment plugin
+# stored in $self->{plug} and its result is handed back unchanged.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    my $plug = $self->{plug};
+
+    return $plug->get_vm_rrd_stats($vmid, $percentile);
+}
+
+# Return the configuration of the VM $vmid.
+#
+# Pure delegation to the environment plugin stored in $self->{plug}.
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+
+    my $plug = $self->{plug};
+
+    return $plug->read_vm_config($vmid);
+}
+
+# Return the configuration of the container $vmid.
+#
+# Pure delegation to the environment plugin stored in $self->{plug}.
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+
+    my $plug = $self->{plug};
+
+    return $plug->read_ct_config($vmid);
+}
+
1;
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 5e0a683..2e1585c 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -9,9 +9,14 @@ use IO::Socket::UNIX;
use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
+use PVE::Cluster;
use PVE::DataCenterConfig;
use PVE::INotify;
use PVE::RPCEnvironment;
+use PVE::API2Tools;
+use PVE::QemuConfig;
+use PVE::LXC::Config;
+use RRDs;
use PVE::HA::Tools ':exit_codes';
use PVE::HA::Env;
@@ -459,4 +464,89 @@ sub get_ha_settings {
return $datacenterconfig->{ha};
}
+# Collect the current statistics of $node.
+#
+# Passes the cluster RRD dump and the cluster member list to
+# PVE::API2Tools::extract_node_stats() and returns whatever that yields.
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    my $rrd_dump = PVE::Cluster::rrd_dump();
+    my $cluster_members = PVE::Cluster::get_members();
+
+    my $node_stats = PVE::API2Tools::extract_node_stats($node, $cluster_members, $rrd_dump);
+
+    return $node_stats;
+}
+
+# Estimate the recent resource usage of guest $vmid from its RRD data.
+#
+# Fetches the AVERAGE consolidation of the last 20 minutes at 60 second
+# resolution from the "pve2-vm/$vmid" RRD (through rrdcached when its socket
+# exists) and returns a hash reference holding the $percentile-th percentile
+# of each series:
+#   cpu      - CPU usage exactly as stored in the RRD
+#              (presumably a 0..1 fraction of maxcpu - TODO confirm)
+#   maxcpu   - second CPU series of the RRD (presumably the assigned cores)
+#   mem      - memory usage (presumably bytes - TODO confirm)
+#   maxmem   - maximum memory
+#   totalcpu - cpu * maxcpu * 100
+# Zero or missing samples are ignored; a series without any usable sample
+# yields 0. A failed fetch is treated like an empty result.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    my $rrd = "/var/lib/rrdcached/db/pve2-vm/$vmid";
+    my $cf = "AVERAGE";
+
+    my $reso = 60;
+    my $ctime = $reso * int(time() / $reso);
+
+    # last 20 minutes, ending one step before the current (aligned) time
+    my $req_start = $ctime - $reso * 20;
+    my $req_end = $ctime - $reso * 1;
+
+    my @args = (
+        "-s" => $req_start,
+        "-e" => $req_end,
+        "-r" => $reso,
+    );
+
+    my $socket = "/var/run/rrdcached.sock";
+    push @args, "--daemon" => "unix:$socket" if -S $socket;
+
+    my (undef, undef, undef, $data) = RRDs::fetch($rrd, $cf, @args);
+    # best effort: without data every statistic simply ends up as 0
+    $data = [] if !defined($data);
+
+    my (@cpu, @mem, @maxmem, @maxcpu);
+
+    foreach my $rec (@$data) {
+        # columns are read positionally; missing samples count as 0
+        my $maxcpu = $rec->[0] || 0;
+        my $cpu = $rec->[1] || 0;
+        my $maxmem = $rec->[2] || 0;
+        my $mem = $rec->[3] || 0;
+
+        # skip zero values (vm is down). cpu is kept as the raw sample: it
+        # must be scaled by maxcpu only once, which happens for totalcpu
+        # below. Scaling it here as well would count maxcpu twice.
+        push @cpu, $cpu if $cpu > 0;
+        push @mem, $mem if $mem > 0;
+        push @maxcpu, $maxcpu if $maxcpu > 0;
+        push @maxmem, $maxmem if $maxmem > 0;
+    }
+
+    my $stats = {};
+
+    $stats->{cpu} = percentile($percentile, \@cpu) || 0;
+    $stats->{mem} = percentile($percentile, \@mem) || 0;
+    $stats->{maxmem} = percentile($percentile, \@maxmem) || 0;
+    $stats->{maxcpu} = percentile($percentile, \@maxcpu) || 0;
+    $stats->{totalcpu} = $stats->{cpu} * $stats->{maxcpu} * 100;
+
+    return $stats;
+}
+
+# Return the $p-th percentile (0..100) of the numbers in array reference
+# $aref.
+#
+# The values are sorted numerically and the element at index
+# int($p * last_index / 100) is returned. For an empty array the result is
+# undef (an empty list in list context).
+sub percentile {
+    my ($p, $aref) = @_;
+
+    return if !@$aref;
+
+    # sort numerically: the default sort compares strings and would put
+    # e.g. 10 before 9
+    my @sorted = sort { $a <=> $b } @$aref;
+
+    my $index = int($p * $#sorted / 100);
+    return $sorted[$index];
+}
+
+# Load the configuration of the VM $vmid through
+# PVE::QemuConfig->load_config() and return it.
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+
+    my $config_class = 'PVE::QemuConfig';
+
+    return $config_class->load_config($vmid);
+}
+
+# Load the configuration of the container $vmid through
+# PVE::LXC::Config->load_config() and return it.
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+
+    my $config_class = 'PVE::LXC::Config';
+
+    return $config_class->load_config($vmid);
+}
+
1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 1c66b43..ae5fbcb 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -1,8 +1,13 @@
+
package PVE::HA::Manager;
use strict;
use warnings;
use Digest::MD5 qw(md5_base64);
+use RRDs;
+use POSIX qw/ceil/;
+use PVE::API2Tools;
+use PVE::Storage;
use PVE::Tools;
use PVE::HA::Tools ':exit_codes';
@@ -394,8 +399,16 @@ sub manage {
my $repeat = 0;
$self->recompute_online_node_usage();
+ $self->recompute_online_node_stats();
- foreach my $sid (sort keys %$ss) {
+ $self->get_service_stats($ss);
+
+ foreach my $sid (
+ #ordering vm by size, bigger mem first then bigger cpu
+ #could be improved with bubblesearch heuristic
+ #https://www.cs.tufts.edu/~nr/cs257/archive/michael-mitzenmacher/bubblesearch.pdf
+ sort { $ss->{$a}->{stats}->{memg} <=> $ss->{$b}->{stats}->{memg} || $ss->{$a}->{stats}->{totalcpuround} <=> $ss->{$b}->{stats}->{totalcpuround} || $ss->{$a}->{type} cmp $ss->{$b}->{type}}
+ keys %$ss) {
my $sd = $ss->{$sid};
my $cd = $sc->{$sid} || { state => 'disabled' };
@@ -802,12 +815,8 @@ sub next_state_recovery {
$self->recompute_online_node_usage(); # we want the most current node state
- my $recovery_node = select_service_node(
- $self->{groups},
- $self->{online_node_usage},
- $cd,
- $sd->{node},
- );
+ my $storecfg = PVE::Storage::config();
+ my $recovery_node = find_bestfit_node_target($haenv, $sid, $cd , $sd->{node}, $sd->{stats}, $self->{online_node_usage}, $self->{online_node_stats}, $self->{groups}, $storecfg);
if ($recovery_node) {
my $msg = "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'";
@@ -822,6 +831,14 @@ sub next_state_recovery {
$haenv->steal_service($sid, $sd->{node}, $recovery_node);
$self->{online_node_usage}->{$recovery_node}++;
+ #add vm cpu/mem to current node stats (this is an estimation based on last 20min vm stats)
+ my $node_stats = $self->{online_node_stats}->{$recovery_node}->{stats};
+ $node_stats->{totalcpu} += $sd->{stats}->{totalcpu};
+ $node_stats->{mem} += $sd->{stats}->{mem};
+ $node_stats->{totalfreecpu} = (100 * $node_stats->{maxcpu}) - $node_stats->{totalcpu};
+ $node_stats->{freemem} = $node_stats->{maxmem} - $node_stats->{mem};
+
+
# NOTE: $sd *is normally read-only*, fencing is the exception
$cd->{node} = $sd->{node} = $recovery_node;
my $new_state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
@@ -839,4 +856,219 @@ sub next_state_recovery {
}
}
+
+# Dot product of two equally sized numeric vectors (array references).
+#
+# $mode selects an optional normalisation:
+#   'normR' - divide by sqrt(|a|^2) * sqrt(|b|^2), i.e. the product of both
+#             vector lengths
+#   'normC' - divide by the squared length of $vec_b
+#   other   - plain dot product
+# A normalised result is 0 when its divisor is 0 (zero vector) instead of
+# dying with a division by zero.
+# Dies if the vectors differ in size.
+sub dotprod {
+    my ($vec_a, $vec_b, $mode) = @_;
+    die "they must have the same size\n" unless @$vec_a == @$vec_b;
+    $mode = "" if !$mode;
+
+    my $sum = 0;
+    my $norm_a = 0;
+    my $norm_b = 0;
+
+    # the element variables are deliberately not called $a/$b, which would
+    # shadow the package variables used by sort
+    for my $i (0 .. $#{$vec_a}) {
+        my ($x, $y) = ($vec_a->[$i], $vec_b->[$i]);
+
+        $sum += $x * $y;
+        $norm_a += $x * $x;
+        $norm_b += $y * $y;
+    }
+
+    if ($mode eq 'normR') {
+        my $divisor = sqrt($norm_a) * sqrt($norm_b);
+        return $divisor ? $sum / $divisor : 0;
+    } elsif ($mode eq 'normC') {
+        return $norm_b ? $sum / $norm_b : 0;
+    }
+    return $sum;
+}
+
+# Euclidean distance between two equally sized numeric vectors (array
+# references): the square root of the summed squared element differences.
+#
+# Dies if the vectors differ in size, consistent with dotprod().
+sub euclidean_distance {
+    my ($vec_a, $vec_b) = @_;
+    die "they must have the same size\n" unless @$vec_a == @$vec_b;
+
+    my $sum = 0;
+
+    # the element variables are deliberately not called $a/$b, which would
+    # shadow the package variables used by sort
+    for my $i (0 .. $#{$vec_a}) {
+        my $delta = $vec_b->[$i] - $vec_a->[$i];
+        $sum += $delta ** 2;
+    }
+
+    return sqrt($sum);
+}
+
+# Choose the node a service should be placed on (best fit).
+#
+# Every node in $online_nodes (node name => { stats => ... }) that passes
+# check_hard_constraints() becomes a candidate and is rated by
+# add_node_prio() using the 'distance' method. Candidates are ordered by
+#   1. HA group priority, higher first
+#   2. number of violated soft constraints, fewer first
+#   3. weight (distance between guest usage and free node resources),
+#      lower first
+#   4. their $online_node_usage value, lower first
+#   5. node name
+# and the name of the first one is returned, or undef if no node qualifies.
+#
+# $nodename is accepted but not used by the selection.
+sub find_bestfit_node_target {
+    my($haenv, $sid, $cd, $nodename, $vm_stats, $online_node_usage, $online_nodes, $groups, $storecfg) = @_;
+
+    my (undef, $vmid) = split(/:/, $sid);
+
+    my $hagroup = get_service_group($groups, $online_nodes, $cd);
+    my ($pri_groups, $group_members_prio) = get_node_priority_groups($hagroup, $online_nodes);
+
+    my $candidates = {};
+    for my $candidate (keys %$online_nodes) {
+        my $node_stats = $online_nodes->{$candidate}->{stats};
+
+        # hard constraints: drop nodes the guest cannot be started on
+        my $usable = check_hard_constraints($haenv, $vmid, $cd, $candidate, $node_stats, $vm_stats, $storecfg, $group_members_prio);
+        next if !$usable;
+
+        # attach group prio, soft constraint count and distance weight
+        $candidates->{$candidate} = add_node_prio($candidate, 'distance', $node_stats, $vm_stats, $group_members_prio, $online_node_usage);
+    }
+
+    my @ranked = sort {
+        my ($left, $right) = ($candidates->{$a}, $candidates->{$b});
+
+        $right->{prio} <=> $left->{prio}
+            || $left->{soft_constraint_prio} <=> $right->{soft_constraint_prio}
+            || $left->{weight} <=> $right->{weight}
+            || $left->{online_node_usage} <=> $right->{online_node_usage}
+            || $left->{name} cmp $right->{name}
+    } keys %$candidates;
+
+    return $ranked[0];
+}
+
+
+# Tell whether guest $vmid could be started on $node at all.
+#
+# Returns 1 if every check passes and nothing (undef / empty list) as soon
+# as one of them fails:
+#   - $node needs an entry in $group_members_prio
+#   - $node must not have fewer CPUs than the guest's maxcpu
+#   - the free memory of $node must cover the guest's maxmem
+#   - the guest's mem and totalcpu must fit into 95% of the free memory /
+#     free cpu of $node
+#   - for services of type 'vm' whose config could be read, the storage
+#     availability check for $node must not fail
+sub check_hard_constraints {
+    my ($haenv, $vmid, $cd, $node, $node_stats, $vm_stats, $storecfg, $group_members_prio) = @_;
+
+    # node needs to have a prio (restricted group)
+    return unless defined($group_members_prio->{$node});
+
+    # static sizing: the vm can't start if the host has fewer cores, or not
+    # enough free memory to hold the vm's max mem
+    return if $vm_stats->{maxcpu} > $node_stats->{maxcpu};
+    return if $vm_stats->{maxmem} > $node_stats->{freemem};
+
+    # measured usage has to stay within 95% of the free cpu/ram
+    my ($mem_limit, $cpu_limit) = (0.95, 0.95);
+    return if $vm_stats->{mem} > $node_stats->{freemem} * $mem_limit;
+    return if $vm_stats->{totalcpu} > $node_stats->{totalfreecpu} * $cpu_limit;
+
+    # check storage availability
+    my $type = $cd->{type};
+    if ($type eq 'vm') {
+        my $conf = eval { $haenv->read_vm_config($vmid) };
+        # the storage check only runs when the config could be loaded
+        unless ($@) {
+            eval { PVE::QemuServer::check_storage_availability($storecfg, $conf, $node) };
+            return if $@;
+        }
+    } elsif ($type eq 'ct') {
+        # the container config is loaded, but not evaluated yet
+        my $conf = eval { $haenv->read_ct_config($vmid) };
+        # fixme: check storage for lxc too
+    }
+
+    # fixme: check bridge availability
+    # fixme: vm: add a check for cpumodel compatibility ?
+    return 1;
+}
+
+# Count how many soft constraints placing the guest on the node would
+# violate (0, 1 or 2).
+#
+# One violation is counted when the guest's mem exceeds 80% of the node's
+# free memory, another one when its totalcpu exceeds 80% of the node's free
+# cpu.
+sub compute_soft_constraints {
+    my ($node_stats, $vm_stats) = @_;
+
+    # try to reach 80% max cpu/ram
+    my %threshold = (mem => 0.8, cpu => 0.8);
+
+    my @violated = grep { $_ } (
+        # not enough memory left below the threshold
+        $vm_stats->{mem} > $node_stats->{freemem} * $threshold{mem},
+        # not enough cpu left below the threshold
+        $vm_stats->{totalcpu} > $node_stats->{totalfreecpu} * $threshold{cpu},
+    );
+
+    # fixme: add antiaffinity
+
+    return scalar(@violated);
+}
+
+# Build the rating record of candidate node $nodename for one guest.
+#
+# Guest usage and free node resources are turned into two-dimensional
+# vectors (cpu, memory divided by 1024^3, both rounded up). $method selects
+# how they are compared: 'distance' uses euclidean_distance(), 'dotprod'
+# uses dotprod(), anything else leaves the weight at 0.
+#
+# Returns a hash reference with the keys weight, soft_constraint_prio (from
+# compute_soft_constraints()), prio (from $group_members_prio),
+# online_node_usage (from $online_node_usage) and name.
+sub add_node_prio {
+    my ($nodename, $method, $node_stats, $vm_stats, $group_members_prio, $online_node_usage) = @_;
+
+    my $gib = 1024 * 1024 * 1024;
+
+    # rounded values to compute vectors (cpu 0-100 , mem 0G-->XG)
+    my @vec_vm = (
+        ceil($vm_stats->{totalcpu}),
+        ceil($vm_stats->{mem} / $gib),
+    ); #? add network usage dimension ?
+    my @vec_node = (
+        ceil($node_stats->{totalfreecpu}),
+        ceil($node_stats->{freemem} / $gib),
+    ); #? add network usage dimension ?
+
+    my %weight_function = (
+        distance => \&euclidean_distance,
+        dotprod => \&dotprod,
+    );
+    my $weigh = $weight_function{$method};
+    my $weight = $weigh ? $weigh->(\@vec_vm, \@vec_node) : 0;
+
+    return {
+        weight => $weight,
+        soft_constraint_prio => scalar(compute_soft_constraints($node_stats, $vm_stats)),
+        prio => $group_members_prio->{$nodename},
+        online_node_usage => $online_node_usage->{$nodename},
+        name => $nodename,
+    };
+}
+
+# Attach type, name and usage statistics to every entry of the service
+# status hash $ss (modified in place).
+#
+# For service ids of the form "vm:<id>", "ct:<id>" or "fa:<id>" the prefix
+# is stored as {type} and the numeric part as {name}. Services in state
+# 'recovery' get their {stats} from get_vm_rrd_stats() of the HA environment
+# (95th percentile); all others get zeroed statistics. Two rounded helper
+# values used for ordering are added to the stats: totalcpuround
+# (cpu * 100 * maxcpu) and memg (mem divided by 1024^3).
+sub get_service_stats {
+    my ($self, $ss) = @_;
+
+    for my $sid (sort keys %$ss) {
+        my $sd = $ss->{$sid};
+
+        if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
+            ($sd->{type}, $sd->{name}) = ($1, $2);
+        }
+
+        # avoid to compute all stats, as currently we only support recovery
+        my $stats = $sd->{state} eq 'recovery'
+            ? $self->{haenv}->get_vm_rrd_stats($sd->{name}, 95)
+            : { cpu => 0, maxcpu => 0, mem => 0, maxmem => 0 };
+        # fixme: windows vm fill memory with zero at boot, so mem = maxmem
+
+        # rounded values for ordering
+        $stats->{totalcpuround} = ceil($stats->{cpu} * 100 * $stats->{maxcpu});
+        $stats->{memg} = ceil($stats->{mem} / 1024 / 1024 / 1024);
+
+        $sd->{stats} = $stats;
+    }
+}
+
+# Rebuild $self->{online_node_stats} from the statistics of all online
+# nodes.
+#
+# For every node returned by $self->{ns}->list_online_nodes() the stats are
+# fetched with get_node_rrd_stats() of the HA environment, undefined
+# cpu/maxcpu/mem/maxmem values are replaced by 0, and the derived values
+# totalcpu (cpu * 100 * maxcpu), totalfreecpu (100 * maxcpu - totalcpu) and
+# freemem (maxmem - mem) are added. The result is stored as
+# node name => { stats => ... }.
+sub recompute_online_node_stats {
+    my ($self) = @_;
+
+    my %stats_of;
+
+    for my $node (@{ $self->{ns}->list_online_nodes() }) {
+        my $stats = $self->{haenv}->get_node_rrd_stats($node);
+
+        for my $key (qw(cpu maxcpu mem maxmem)) {
+            $stats->{$key} = 0 if !defined($stats->{$key});
+        }
+
+        # how to handle different cpu model power ? bogomips ?
+        my $cpu_capacity = 100 * $stats->{maxcpu};
+        $stats->{totalcpu} = $stats->{cpu} * 100 * $stats->{maxcpu};
+        $stats->{totalfreecpu} = $cpu_capacity - $stats->{totalcpu};
+        $stats->{freemem} = $stats->{maxmem} - $stats->{mem};
+
+        $stats_of{$node}->{stats} = $stats;
+    }
+
+    $self->{online_node_stats} = \%stats_of;
+}
+
1;
diff --git a/src/PVE/HA/Sim/TestEnv.pm b/src/PVE/HA/Sim/TestEnv.pm
index 6718d8c..08f27c7 100644
--- a/src/PVE/HA/Sim/TestEnv.pm
+++ b/src/PVE/HA/Sim/TestEnv.pm
@@ -118,4 +118,31 @@ sub get_max_workers {
return 0;
}
+# Test environment stand-in for the node RRD statistics: always reports 0
+# for cpu, maxcpu, mem and maxmem, whatever $node is.
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    return {
+        cpu => 0,
+        maxcpu => 0,
+        mem => 0,
+        maxmem => 0,
+    };
+}
+
+# Test environment stand-in for the guest RRD statistics: every base value
+# is 0, hence so are the scaled cpu value and the derived totalcpu. $vmid
+# and $percentile are ignored.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    my %stats = (
+        cpu => 0,
+        mem => 0,
+        maxmem => 0,
+        maxcpu => 0,
+    );
+
+    # same derivation as with real numbers: cpu scaled by 100, then
+    # multiplied by the number of cpus
+    $stats{cpu} *= 100;
+    $stats{totalcpu} = $stats{cpu} * $stats{maxcpu};
+
+    return \%stats;
+}
+
1;
--
2.30.2
More information about the pve-devel
mailing list