[pve-devel] [RFC v2/2 manager] pvestatd: cpu utilization based rebalancing
Wolfgang Bumiller
w.bumiller at proxmox.com
Thu Oct 20 13:43:53 CEST 2016
---
PVE/Service/pvestatd.pm | 335 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 332 insertions(+), 3 deletions(-)
diff --git a/PVE/Service/pvestatd.pm b/PVE/Service/pvestatd.pm
index 98e5844..6164546 100755
--- a/PVE/Service/pvestatd.pm
+++ b/PVE/Service/pvestatd.pm
@@ -3,6 +3,8 @@ package PVE::Service::pvestatd;
use strict;
use warnings;
+use POSIX qw(floor);
+
use PVE::SafeSyslog;
use PVE::Daemon;
@@ -15,6 +17,7 @@ use PVE::Cluster qw(cfs_read_file);
use PVE::Storage;
use PVE::QemuServer;
use PVE::LXC;
+use PVE::LXC::Config;
use PVE::RPCEnvironment;
use PVE::API2::Subscription;
use PVE::AutoBalloon;
@@ -221,7 +224,7 @@ sub remove_stale_lxc_consoles {
}
sub update_lxc_status {
- my ($status_cfg) = @_;
+ my ($status_cfg, $state) = @_;
my $ctime = time();
@@ -253,6 +256,8 @@ sub update_lxc_status {
$plugin->update_lxc_status($plugin_config, $vmid, $d, $ctime);
}
}
+
+ rebalance($vmstatus, $state);
}
sub update_storage_status {
@@ -282,7 +287,326 @@ sub update_storage_status {
}
}
+# FIXME: already in QemuServer (but for semicolon-separated sets), move to Tools
+sub parse_number_sets {
+ my ($set, $re) = @_;
+ my $res = [];
+ $re = qr/;/ if !defined($re);
+ foreach my $part (split($re, $set)) {
+ if ($part =~ /^\s*(\d+)(?:-(\d+))?\s*$/) {
+ die "invalid range: $part ($2 < $1)\n" if defined($2) && $2 < $1;
+ push @$res, [$1, $2];
+ } else {
+ die "invalid range: $part\n";
+ }
+ }
+ return $res;
+}
+
+sub number_setlist_to_list {
+ my ($setlist) = @_;
+ return map { $_->[0] .. ($_->[1]//$_->[0]) } @$setlist;
+}
+
+sub number_list_to_mask {
+ my ($list, $prealloc) = @_;
+ # preallocate a good estimated minimum number of zeroes
+ my $mask = [(0) x $prealloc];
+ foreach my $id (@$list) {
+ push @$mask, 0 while $id >= @$mask;
+ $mask->[$id] = 1;
+ }
+ return $mask;
+}
+
+sub get_cpusets {
+ my ($cgroup, $kind) = @_;
+ $kind = 'cpus' if !defined($kind);
+ my $set_text = PVE::Tools::file_read_firstline(
+ "/sys/fs/cgroup/cpuset/$cgroup/cpuset.$kind");
+ return parse_number_sets($set_text, qr/,/);
+}
+
+sub get_cpuacct {
+ my ($cgroup) = @_;
+ my $usage_text = PVE::Tools::file_read_firstline(
+ "/sys/fs/cgroup/cpuacct/$cgroup/cpuacct.usage_percpu");
+ return [split(/\s+/, $usage_text)];
+}
+
+# subtract b from a, not expanding a if b is longer, assuming non-existing
+# elements in b are equal to a (=> b may be undef or contain undef values)
+my $subtract_list_safe = sub {
+ my ($a, $b) = @_;
+ return [(0) x scalar(@$a)] if !$b;
+ return [map { $b->[$_] ? ($a->[$_] - $b->[$_]) : 0 } (0..@$a-1)];
+};
+
+# FIXME: Candidate for PVE/LXC.pm?
+sub has_lxc_entry {
+ my ($conf, $keyname) = @_;
+ foreach my $entry (@{$conf->{lxc}}) {
+ my ($key, undef) = @$entry;
+ return 1 if $key eq $keyname;
+ }
+ return 0;
+}
+
+sub apply_cpumask {
+ my ($vmid, $mask, $curmask) = @_;
+ my $value = '';
+ my $changed = !$curmask;
+
+ for (my $id = 0; $id != @$mask; ++$id) {
+ if (!$mask->[$id]) {
+ $changed = 1 if !$changed && ($id < @$curmask && $curmask->[$id]);
+ next;
+ }
+ $changed = 1 if !$changed && ($id >= @$curmask || !$curmask->[$id]);
+ $value .= ',' if length($value);
+ $value .= $id;
+ }
+ if (!$changed && $curmask) {
+ for (my $id = @$mask; $id < @$curmask; ++$id) {
+ if ($curmask->[$id]) {
+ $changed = 1;
+ last;
+ }
+ }
+ }
+ return if !$changed;
+ open(my $fh, '>', "/sys/fs/cgroup/cpuset/lxc/$vmid/cpuset.cpus")
+ or die "failed to open cpuset for $vmid: $!\n";
+ print {$fh} "$value\n";
+ close($fh);
+}
+
+sub gather_ct_states_and_hotplug_limits {
+ my ($state, $vmstatus, $all_cpus) = @_;
+
+ my $ct_times = $state->{ct_times};
+
+ # This is the data we'll be filling
+ my $cts_per_cpu = [];
+ my $ct_cpu_utilization = {};
+ my $ct_cpumasks = {};
+
+ my $cpucount = scalar(@$all_cpus);
+ my $max_cpuid = $all_cpus->[-1];
+
+ foreach my $vmid (keys %$vmstatus) {
+ my $d = $vmstatus->{$vmid};
+ next if !$d->{pid}; # only active containers are of interest to us
+ my $conf = eval { PVE::LXC::Config->load_config($vmid) };
+ if ($@) {
+ warn $@;
+ next;
+ }
+
+ # ignore containers pinned to specific cpus
+ next if has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus');
+
+ my $cpulimit = $conf->{cpulimit} || $cpucount;
+
+ # Containers which don't take part in balancing still need to be checked
+ # for hotplugged changes, so we set the $no_balancing flag if the
+    # cpulimit equals the host cpucount instead of skipping it right away.
+ # (eg. It may have had a limit before)
+ my $no_balancing;
+ $no_balancing = 1 if $cpulimit == $cpucount;
+
+ # FIXME: add this when it's available in the container's JSON schema.
+ #my $use_cpusets = $conf->{use_cpusets};
+ #my $was_balancing = $state->{ct_use_cpusets}->{$vmid};
+ #$state->{ct_use_cpusets}->{$vmid} = $use_cpusets;
+ #$no_balancing = 1 if !$use_cpusets;
+
+ # get the current cpuset:
+ my $cpu_setlist = get_cpusets("lxc/$vmid");
+ my @cpu_list = number_setlist_to_list($cpu_setlist);
+ my $curmask = number_list_to_mask(\@cpu_list, $cpu_list[-1]);
+
+ # see if the cpulimit was hot-reduced or hasn't been enacted at all yet
+ my $newmask;
+ if ($cpulimit < @cpu_list) {
+ splice(@cpu_list, $cpulimit);
+ $newmask = number_list_to_mask(\@cpu_list, $cpu_list[-1]);
+ } elsif ($cpulimit > @cpu_list) {
+ $newmask = [@$curmask];
+ my $count = scalar(@cpu_list);
+ foreach my $cpu (@$all_cpus) {
+ if (!$newmask->[$cpu]) {
+ $newmask->[$cpu] = 1;
+ push @cpu_list, $cpu; # for later
+
+ ++$count;
+ last if $count == $cpulimit;
+ }
+ }
+ } else {
+ $newmask = [@$curmask];
+ }
+
+ # Apply hot-plugged changes if any:
+ apply_cpumask($vmid, $newmask, $curmask);
+ next if $no_balancing;
+
+ # add to the set of containers per cpu
+ foreach my $cpu (@cpu_list) {
+ push @$cts_per_cpu, [] while $cpu >= @$cts_per_cpu;
+ push @{$cts_per_cpu->[$cpu]}, $vmid;
+ }
+
+ # add the cpu mask
+ $ct_cpumasks->{$vmid} = $newmask;
+
+ # gather cpu utilization data
+ my $ct_last = $ct_times->{$vmid};
+ my $ct_now = get_cpuacct("lxc/$vmid");
+ $ct_times->{$vmid} = $ct_now;
+ my $ct_diff = &$subtract_list_safe($ct_now, $ct_last);
+ $ct_cpu_utilization->{$vmid} = $ct_diff;
+ }
+
+ return ($cts_per_cpu, $ct_cpu_utilization, $ct_cpumasks);
+}
+
+sub rebalance {
+ my ($vmstatus, $state) = @_;
+
+ return if !-d '/sys/fs/cgroup/cpuset/lxc'; # nothing to do...
+
+ my $time_now = gettimeofday();
+ my $time_last = $state->{time};
+ $state->{time} = $time_now;
+
+    # FIXME: Time::HiRes must have something better for this purpose?
+ my $time_diff_ns = defined($time_last) ? floor(($time_now - $time_last) * 1000000000) : 0;
+
+ my $min = sub { $_[1] < $_[0] ? $_[1] : $_[0] };
+ # Get cpu times of the root and lxc cgroups, limit them to the above diff
+ # to avoid negative values.
+ my $root_last = $state->{root_times};
+ my $root_now = get_cpuacct('');
+ $state->{root_times} = $root_now;
+ my $root_diff = &$subtract_list_safe($root_now, $root_last);
+ $root_diff = [ map { &$min($_, $time_diff_ns) } @$root_diff ];
+ # lxc cgroup as a whole
+ my $lxc_last = $state->{lxc_times};
+ my $lxc_now = get_cpuacct('lxc');
+ $state->{lxc_times} = $lxc_now;
+ my $lxc_diff = &$subtract_list_safe($lxc_now, $lxc_last);
+ $lxc_diff = [ map { &$min($_, $time_diff_ns) } @$lxc_diff ];
+
+ # Get a list of available CPUs
+ # (and transform the array of ranges to a flat array)
+ my $all_cpus = [number_setlist_to_list(get_cpusets('lxc', 'effective_cpus'))];
+
+ # Update the container times and count the containers on our cores.
+ my ($cts_per_cpu, $ct_diff, $ct_cpumask) =
+ gather_ct_states_and_hotplug_limits($state, $vmstatus, $all_cpus);
+
+ # On the first run we only collect data as we have no way of getting the
+ # actual usage now.
+ return if !$time_diff_ns;
+
+ # Get total cpu utilization in range 0..1
+ my $total_time = 0;
+ $total_time += $root_diff->[$_] foreach @$all_cpus;
+ my $total_usage = $total_time / ($time_diff_ns * scalar(@$all_cpus));
+
+ if ($total_usage > 0.9) {
+ # All cores are heavily utilized, there's no point in rescheduling.
+ #debugf("Total usage: %1.2f\n", $total_usage);
+ return;
+ }
+
+ my $timep80 = $time_diff_ns * 0.8;
+
+ my @cpus_by_usage = sort {
+ $root_diff->[$b] <=> $root_diff->[$a]
+ } @$all_cpus;
+ my @free_cpus = reverse @cpus_by_usage;
+
+ my @balanced_root_diff = @$root_diff;
+
+ my %rebalanced;
+
+ my $inv_ns_diff = 1.0 / $time_diff_ns; # divisions hurt
+ foreach my $cpu (@cpus_by_usage) {
+ my $cpu_diff = $root_diff->[$cpu];
+ # if this core isn't busy enough we can stop here, since we're going
+ # through cores sorted by their utilization
+ my $usage = $cpu_diff * $inv_ns_diff;
+ last if $usage <= 0.8 || $usage <= $total_usage;
+
+ #debugf("CPU %i usage: %1.2f\n", $cpu, $usage);
+
+ # If most of this core's utilization (>90%) comes from the host, we
+ # don't do anything either.
+ my $lxc_fraction = $lxc_diff->[$cpu] / $cpu_diff;
+ #debugf(" (CPU %i is used by host)\n", $cpu) if $lxc_fraction < 0.1;
+ next if $lxc_fraction < 0.1;
+
+ # Of course, if there's no container on this core, move on.
+ my $cts = $cts_per_cpu->[$cpu];
+ next if !defined($cts) || !@$cts;
+
+ # Here comes the heavy lifting:
+ #debugf(" %i containers to balance away from cpu %i\n", scalar(@$cts), $cpu);
+ my @cts_by_usage = sort { $ct_diff->{$a}->[$cpu] <=> $ct_diff->{$b}->[$cpu] } @$cts;
+ my $orig_cpudiff = $cpu_diff;
+ while (@cts_by_usage && $cpu_diff > $timep80) {
+ my $ct = shift @cts_by_usage;
+ my $diff = $ct_diff->{$ct}->[$cpu];
+
+ # If this container is responsible for (almost) all of this core's
+ # load, moving it would just _shift_ the load, not _balance_ it.
+ # Also, since they're sorted by usage, it means that this is the
+ # last container in the list, otherwise their total usage would
+ # sum up to more than 100%.
+ my $fraction = $diff / $orig_cpudiff;
+ #debugf(" Container %i is the only user of cpu %s\n", $ct, $cpu);
+ #debugf(" your math doesn't add up\n") if $fraction > 0.9 && @cts_by_usage;
+ next if $fraction > 0.9;
+
+ # This core's getting too crowded for me, I'm outta here!
+ # Find a new home:
+ my $newcpu;
+ my $cpumask = $ct_cpumask->{$ct};
+ foreach my $candidate (@free_cpus) {
+ if (!$cpumask->[$candidate]) {
+ $newcpu = $candidate;
+ last;
+ }
+ }
+ if (!defined($newcpu) || $newcpu == $cpu ||
+ ($balanced_root_diff[$newcpu] + $diff) > $timep80)
+ {
+ #debugf(" CT %s too busy at core %i\n", $ct, $cpu);
+ } else {
+ #debugf(" CT %s core %i => %i\n", $ct, $cpu, $newcpu);
+ $balanced_root_diff[$cpu] -= $diff;
+ $balanced_root_diff[$newcpu] += $diff;
+ $cpumask->[$cpu] = 0;
+ $cpumask->[$newcpu] = 1;
+ $cpu_diff -= $diff;
+ $rebalanced{$ct} = 1;
+ # Re-sort
+ @free_cpus = sort {
+ $balanced_root_diff[$a] <=> $balanced_root_diff[$b]
+ } @free_cpus;
+ #debugf(" %s + %s ((%s))\n", $cpu_diff, $diff, $timep80);
+ }
+ }
+ }
+ foreach my $ct (keys %rebalanced) {
+ apply_cpumask($ct, $ct_cpumask->{$ct}, undef);
+ }
+}
+
sub update_status {
+ my ($state) = @_;
# update worker list. This is not really required and
# we just call this to make sure that we have a correct
@@ -309,7 +633,7 @@ sub update_status {
syslog('err', "qemu status update error: $err") if $err;
eval {
- update_lxc_status($status_cfg);
+ update_lxc_status($status_cfg, $state);
};
$err = $@;
syslog('err', "lxc status update error: $err") if $err;
@@ -339,6 +663,11 @@ my $initial_memory_usage;
sub run {
my ($self) = @_;
+ my $state = {
+ ct_times => {},
+ # ct_use_cpusets => {},
+ };
+
for (;;) { # forever
$next_update = time() + $updatetime;
@@ -348,7 +677,7 @@ sub run {
eval {
# syslog('info', "start status update");
PVE::Cluster::cfs_update();
- update_status();
+ update_status($state);
};
my $err = $@;
--
2.1.4
More information about the pve-devel
mailing list