[pve-devel] [RFC v2/2 manager] pvestatd: cpu utilization based rebalancing
Wolfgang Bumiller
w.bumiller at proxmox.com
Thu Oct 20 13:43:53 CEST 2016
---
PVE/Service/pvestatd.pm | 335 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 332 insertions(+), 3 deletions(-)
diff --git a/PVE/Service/pvestatd.pm b/PVE/Service/pvestatd.pm
index 98e5844..6164546 100755
--- a/PVE/Service/pvestatd.pm
+++ b/PVE/Service/pvestatd.pm
@@ -3,6 +3,8 @@ package PVE::Service::pvestatd;
use strict;
use warnings;
+use POSIX qw(floor);
+
use PVE::SafeSyslog;
use PVE::Daemon;
@@ -15,6 +17,7 @@ use PVE::Cluster qw(cfs_read_file);
use PVE::Storage;
use PVE::QemuServer;
use PVE::LXC;
+use PVE::LXC::Config;
use PVE::RPCEnvironment;
use PVE::API2::Subscription;
use PVE::AutoBalloon;
@@ -221,7 +224,7 @@ sub remove_stale_lxc_consoles {
}
sub update_lxc_status {
- my ($status_cfg) = @_;
+ my ($status_cfg, $state) = @_;
my $ctime = time();
@@ -253,6 +256,8 @@ sub update_lxc_status {
$plugin->update_lxc_status($plugin_config, $vmid, $d, $ctime);
}
}
+
+ rebalance($vmstatus, $state);
}
sub update_storage_status {
@@ -282,7 +287,326 @@ sub update_storage_status {
}
}
+# FIXME: already in QemuServer (but for semicolon-separated sets), move to Tools
+sub parse_number_sets {
+ my ($set, $re) = @_;
+ my $res = [];
+ $re = qr/;/ if !defined($re);
+ foreach my $part (split($re, $set)) {
+ if ($part =~ /^\s*(\d+)(?:-(\d+))?\s*$/) {
+ die "invalid range: $part ($2 < $1)\n" if defined($2) && $2 < $1;
+ push @$res, [$1, $2];
+ } else {
+ die "invalid range: $part\n";
+ }
+ }
+ return $res;
+}
+
+sub number_setlist_to_list {
+ my ($setlist) = @_;
+ return map { $_->[0] .. ($_->[1]//$_->[0]) } @$setlist;
+}
+
+sub number_list_to_mask {
+ my ($list, $prealloc) = @_;
+ # preallocate a good estimated minimum number of zeroes
+ my $mask = [(0) x $prealloc];
+ foreach my $id (@$list) {
+ push @$mask, 0 while $id >= @$mask;
+ $mask->[$id] = 1;
+ }
+ return $mask;
+}
+
+sub get_cpusets {
+ my ($cgroup, $kind) = @_;
+ $kind = 'cpus' if !defined($kind);
+ my $set_text = PVE::Tools::file_read_firstline(
+ "/sys/fs/cgroup/cpuset/$cgroup/cpuset.$kind");
+ return parse_number_sets($set_text, qr/,/);
+}
+
+sub get_cpuacct {
+ my ($cgroup) = @_;
+ my $usage_text = PVE::Tools::file_read_firstline(
+ "/sys/fs/cgroup/cpuacct/$cgroup/cpuacct.usage_percpu");
+ return [split(/\s+/, $usage_text)];
+}
+
+# subtract b from a, not expanding a if b is longer, assuming non-existing
+# elements in b are equal to a (=> b may be undef or contain undef values)
+my $subtract_list_safe = sub {
+ my ($a, $b) = @_;
+ return [(0) x scalar(@$a)] if !$b;
+ return [map { $b->[$_] ? ($a->[$_] - $b->[$_]) : 0 } (0..@$a-1)];
+};
+
+# FIXME: Candidate for PVE/LXC.pm?
+sub has_lxc_entry {
+ my ($conf, $keyname) = @_;
+ foreach my $entry (@{$conf->{lxc}}) {
+ my ($key, undef) = @$entry;
+ return 1 if $key eq $keyname;
+ }
+ return 0;
+}
+
+sub apply_cpumask {
+ my ($vmid, $mask, $curmask) = @_;
+ my $value = '';
+ my $changed = !$curmask;
+
+ for (my $id = 0; $id != @$mask; ++$id) {
+ if (!$mask->[$id]) {
+ $changed = 1 if !$changed && ($id < @$curmask && $curmask->[$id]);
+ next;
+ }
+ $changed = 1 if !$changed && ($id >= @$curmask || !$curmask->[$id]);
+ $value .= ',' if length($value);
+ $value .= $id;
+ }
+ if (!$changed && $curmask) {
+ for (my $id = @$mask; $id < @$curmask; ++$id) {
+ if ($curmask->[$id]) {
+ $changed = 1;
+ last;
+ }
+ }
+ }
+ return if !$changed;
+ open(my $fh, '>', "/sys/fs/cgroup/cpuset/lxc/$vmid/cpuset.cpus")
+ or die "failed to open cpuset for $vmid: $!\n";
+ print {$fh} "$value\n";
+ close($fh);
+}
+
+sub gather_ct_states_and_hotplug_limits {
+ my ($state, $vmstatus, $all_cpus) = @_;
+
+ my $ct_times = $state->{ct_times};
+
+ # This is the data we'll be filling
+ my $cts_per_cpu = [];
+ my $ct_cpu_utilization = {};
+ my $ct_cpumasks = {};
+
+ my $cpucount = scalar(@$all_cpus);
+ my $max_cpuid = $all_cpus->[-1];
+
+ foreach my $vmid (keys %$vmstatus) {
+ my $d = $vmstatus->{$vmid};
+ next if !$d->{pid}; # only active containers are of interest to us
+ my $conf = eval { PVE::LXC::Config->load_config($vmid) };
+ if ($@) {
+ warn $@;
+ next;
+ }
+
+ # ignore containers pinned to specific cpus
+ next if has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus');
+
+ my $cpulimit = $conf->{cpulimit} || $cpucount;
+
+ # Containers which don't take part in balancing still need to be checked
+ # for hotplugged changes, so we set the $no_balancing flag if the
+    # cpulimit equals the host cpucount instead of skipping it right away.
+ # (eg. It may have had a limit before)
+ my $no_balancing;
+ $no_balancing = 1 if $cpulimit == $cpucount;
+
+ # FIXME: add this when it's available in the container's JSON schema.
+ #my $use_cpusets = $conf->{use_cpusets};
+ #my $was_balancing = $state->{ct_use_cpusets}->{$vmid};
+ #$state->{ct_use_cpusets}->{$vmid} = $use_cpusets;
+ #$no_balancing = 1 if !$use_cpusets;
+
+ # get the current cpuset:
+ my $cpu_setlist = get_cpusets("lxc/$vmid");
+ my @cpu_list = number_setlist_to_list($cpu_setlist);
+ my $curmask = number_list_to_mask(\@cpu_list, $cpu_list[-1]);
+
+ # see if the cpulimit was hot-reduced or hasn't been enacted at all yet
+ my $newmask;
+ if ($cpulimit < @cpu_list) {
+ splice(@cpu_list, $cpulimit);
+ $newmask = number_list_to_mask(\@cpu_list, $cpu_list[-1]);
+ } elsif ($cpulimit > @cpu_list) {
+ $newmask = [@$curmask];
+ my $count = scalar(@cpu_list);
+ foreach my $cpu (@$all_cpus) {
+ if (!$newmask->[$cpu]) {
+ $newmask->[$cpu] = 1;
+ push @cpu_list, $cpu; # for later
+
+ ++$count;
+ last if $count == $cpulimit;
+ }
+ }
+ } else {
+ $newmask = [@$curmask];
+ }
+
+ # Apply hot-plugged changes if any:
+ apply_cpumask($vmid, $newmask, $curmask);
+ next if $no_balancing;
+
+ # add to the set of containers per cpu
+ foreach my $cpu (@cpu_list) {
+ push @$cts_per_cpu, [] while $cpu >= @$cts_per_cpu;
+ push @{$cts_per_cpu->[$cpu]}, $vmid;
+ }
+
+ # add the cpu mask
+ $ct_cpumasks->{$vmid} = $newmask;
+
+ # gather cpu utilization data
+ my $ct_last = $ct_times->{$vmid};
+ my $ct_now = get_cpuacct("lxc/$vmid");
+ $ct_times->{$vmid} = $ct_now;
+ my $ct_diff = &$subtract_list_safe($ct_now, $ct_last);
+ $ct_cpu_utilization->{$vmid} = $ct_diff;
+ }
+
+ return ($cts_per_cpu, $ct_cpu_utilization, $ct_cpumasks);
+}
+
+sub rebalance {
+ my ($vmstatus, $state) = @_;
+
+ return if !-d '/sys/fs/cgroup/cpuset/lxc'; # nothing to do...
+
+ my $time_now = gettimeofday();
+ my $time_last = $state->{time};
+ $state->{time} = $time_now;
+
+    # FIXME: Time::HiRes must have something better for this purpose?
+ my $time_diff_ns = defined($time_last) ? floor(($time_now - $time_last) * 1000000000) : 0;
+
+ my $min = sub { $_[1] < $_[0] ? $_[1] : $_[0] };
+ # Get cpu times of the root and lxc cgroups, limit them to the above diff
+ # to avoid negative values.
+ my $root_last = $state->{root_times};
+ my $root_now = get_cpuacct('');
+ $state->{root_times} = $root_now;
+ my $root_diff = &$subtract_list_safe($root_now, $root_last);
+ $root_diff = [ map { &$min($_, $time_diff_ns) } @$root_diff ];
+ # lxc cgroup as a whole
+ my $lxc_last = $state->{lxc_times};
+ my $lxc_now = get_cpuacct('lxc');
+ $state->{lxc_times} = $lxc_now;
+ my $lxc_diff = &$subtract_list_safe($lxc_now, $lxc_last);
+ $lxc_diff = [ map { &$min($_, $time_diff_ns) } @$lxc_diff ];
+
+ # Get a list of available CPUs
+ # (and transform the array of ranges to a flat array)
+ my $all_cpus = [number_setlist_to_list(get_cpusets('lxc', 'effective_cpus'))];
+
+ # Update the container times and count the containers on our cores.
+ my ($cts_per_cpu, $ct_diff, $ct_cpumask) =
+ gather_ct_states_and_hotplug_limits($state, $vmstatus, $all_cpus);
+
+ # On the first run we only collect data as we have no way of getting the
+ # actual usage now.
+ return if !$time_diff_ns;
+
+ # Get total cpu utilization in range 0..1
+ my $total_time = 0;
+ $total_time += $root_diff->[$_] foreach @$all_cpus;
+ my $total_usage = $total_time / ($time_diff_ns * scalar(@$all_cpus));
+
+ if ($total_usage > 0.9) {
+ # All cores are heavily utilized, there's no point in rescheduling.
+ #debugf("Total usage: %1.2f\n", $total_usage);
+ return;
+ }
+
+ my $timep80 = $time_diff_ns * 0.8;
+
+ my @cpus_by_usage = sort {
+ $root_diff->[$b] <=> $root_diff->[$a]
+ } @$all_cpus;
+ my @free_cpus = reverse @cpus_by_usage;
+
+ my @balanced_root_diff = @$root_diff;
+
+ my %rebalanced;
+
+ my $inv_ns_diff = 1.0 / $time_diff_ns; # divisions hurt
+ foreach my $cpu (@cpus_by_usage) {
+ my $cpu_diff = $root_diff->[$cpu];
+ # if this core isn't busy enough we can stop here, since we're going
+ # through cores sorted by their utilization
+ my $usage = $cpu_diff * $inv_ns_diff;
+ last if $usage <= 0.8 || $usage <= $total_usage;
+
+ #debugf("CPU %i usage: %1.2f\n", $cpu, $usage);
+
+ # If most of this core's utilization (>90%) comes from the host, we
+ # don't do anything either.
+ my $lxc_fraction = $lxc_diff->[$cpu] / $cpu_diff;
+ #debugf(" (CPU %i is used by host)\n", $cpu) if $lxc_fraction < 0.1;
+ next if $lxc_fraction < 0.1;
+
+ # Of course, if there's no container on this core, move on.
+ my $cts = $cts_per_cpu->[$cpu];
+ next if !defined($cts) || !@$cts;
+
+ # Here comes the heavy lifting:
+ #debugf(" %i containers to balance away from cpu %i\n", scalar(@$cts), $cpu);
+ my @cts_by_usage = sort { $ct_diff->{$a}->[$cpu] <=> $ct_diff->{$b}->[$cpu] } @$cts;
+ my $orig_cpudiff = $cpu_diff;
+ while (@cts_by_usage && $cpu_diff > $timep80) {
+ my $ct = shift @cts_by_usage;
+ my $diff = $ct_diff->{$ct}->[$cpu];
+
+ # If this container is responsible for (almost) all of this core's
+ # load, moving it would just _shift_ the load, not _balance_ it.
+ # Also, since they're sorted by usage, it means that this is the
+ # last container in the list, otherwise their total usage would
+ # sum up to more than 100%.
+ my $fraction = $diff / $orig_cpudiff;
+ #debugf(" Container %i is the only user of cpu %s\n", $ct, $cpu);
+ #debugf(" your math doesn't add up\n") if $fraction > 0.9 && @cts_by_usage;
+ next if $fraction > 0.9;
+
+ # This core's getting too crowded for me, I'm outta here!
+ # Find a new home:
+ my $newcpu;
+ my $cpumask = $ct_cpumask->{$ct};
+ foreach my $candidate (@free_cpus) {
+ if (!$cpumask->[$candidate]) {
+ $newcpu = $candidate;
+ last;
+ }
+ }
+ if (!defined($newcpu) || $newcpu == $cpu ||
+ ($balanced_root_diff[$newcpu] + $diff) > $timep80)
+ {
+ #debugf(" CT %s too busy at core %i\n", $ct, $cpu);
+ } else {
+ #debugf(" CT %s core %i => %i\n", $ct, $cpu, $newcpu);
+ $balanced_root_diff[$cpu] -= $diff;
+ $balanced_root_diff[$newcpu] += $diff;
+ $cpumask->[$cpu] = 0;
+ $cpumask->[$newcpu] = 1;
+ $cpu_diff -= $diff;
+ $rebalanced{$ct} = 1;
+ # Re-sort
+ @free_cpus = sort {
+ $balanced_root_diff[$a] <=> $balanced_root_diff[$b]
+ } @free_cpus;
+ #debugf(" %s + %s ((%s))\n", $cpu_diff, $diff, $timep80);
+ }
+ }
+ }
+ foreach my $ct (keys %rebalanced) {
+ apply_cpumask($ct, $ct_cpumask->{$ct}, undef);
+ }
+}
+
sub update_status {
+ my ($state) = @_;
# update worker list. This is not really required and
# we just call this to make sure that we have a correct
@@ -309,7 +633,7 @@ sub update_status {
syslog('err', "qemu status update error: $err") if $err;
eval {
- update_lxc_status($status_cfg);
+ update_lxc_status($status_cfg, $state);
};
$err = $@;
syslog('err', "lxc status update error: $err") if $err;
@@ -339,6 +663,11 @@ my $initial_memory_usage;
sub run {
my ($self) = @_;
+ my $state = {
+ ct_times => {},
+ # ct_use_cpusets => {},
+ };
+
for (;;) { # forever
$next_update = time() + $updatetime;
@@ -348,7 +677,7 @@ sub run {
eval {
# syslog('info', "start status update");
PVE::Cluster::cfs_update();
- update_status();
+ update_status($state);
};
my $err = $@;
--
2.1.4
More information about the pve-devel
mailing list