[pve-devel] [RFC v1/2 manager] pvestatd: add simple container cpuset balancing
Wolfgang Bumiller
w.bumiller at proxmox.com
Thu Oct 20 13:43:52 CEST 2016
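
Balance the cpusets of running containers on every pvestatd container
status update: containers without an explicit lxc.cgroup.cpuset.cpus
entry are assigned the least-used cpus from the node's allowed set, up
to their configured cpulimit. Containers with a fixed cpuset are left
untouched but still counted. The container's cpuset.cpus is only
rewritten when the computed mask differs from the currently active one.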
---
PVE/Service/pvestatd.pm | 171 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 171 insertions(+)
diff --git a/PVE/Service/pvestatd.pm b/PVE/Service/pvestatd.pm
index 98e5844..0d51574 100755
--- a/PVE/Service/pvestatd.pm
+++ b/PVE/Service/pvestatd.pm
@@ -15,6 +15,7 @@ use PVE::Cluster qw(cfs_read_file);
use PVE::Storage;
use PVE::QemuServer;
use PVE::LXC;
+use PVE::LXC::Config;
use PVE::RPCEnvironment;
use PVE::API2::Subscription;
use PVE::AutoBalloon;
@@ -253,6 +254,8 @@ sub update_lxc_status {
$plugin->update_lxc_status($plugin_config, $vmid, $d, $ctime);
}
}
+
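+    # spread the running containers over the allowed cpus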
+ rebalance($vmstatus);
}
sub update_storage_status {
@@ -282,6 +285,174 @@ sub update_storage_status {
}
}
+# FIXME: already in QemuServer (but for semicolon-separated sets), move to Tools
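+# For example, parse_number_sets('0-2,7', qr/,/) returns [[0,2],[7,undef]].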
+sub parse_number_sets {
+ my ($set, $re) = @_;
+ my $res = [];
+ $re = qr/;/ if !defined($re);
+ foreach my $part (split($re, $set)) {
+ if ($part =~ /^\s*(\d+)(?:-(\d+))?\s*$/) {
+ die "invalid range: $part ($2 < $1)\n" if defined($2) && $2 < $1;
+ push @$res, [$1, $2];
+ } else {
+ die "invalid range: $part\n";
+ }
+ }
+ return $res;
+}
+
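+# Flatten a parsed setlist into a list of ids, for example
+# [[0,2],[7,undef]] becomes (0, 1, 2, 7).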
+sub number_setlist_to_list {
+ my ($setlist) = @_;
+ return map { $_->[0] .. ($_->[1]//$_->[0]) } @$setlist;
+}
+
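+# Read and parse a cpuset from the cgroupv1 filesystem, for example
+# get_cpusets('lxc', 'effective_cpus') parses
+# /sys/fs/cgroup/cpuset/lxc/cpuset.effective_cpus.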
+sub get_cpusets {
+ my ($cgroup, $kind) = @_;
+ $kind = 'cpus' if !defined($kind);
+ my $set_text = PVE::Tools::file_read_firstline(
+ "/sys/fs/cgroup/cpuset/$cgroup/cpuset.$kind");
+    return [] if !defined($set_text); # the cgroup may already be gone
+    return parse_number_sets($set_text, qr/,/);
+}
+
+# FIXME: Candidate for PVE/LXC.pm?
+sub has_lxc_entry {
+ my ($conf, $keyname) = @_;
+ foreach my $entry (@{$conf->{lxc}}) {
+ my ($key, undef) = @$entry;
+ return 1 if $key eq $keyname;
+ }
+ return 0;
+}
+
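+# Write a cpu mask (array of booleans indexed by cpu id) to the
+# container's cpuset.cpus, skipping the write if it equals $curmask.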
+sub apply_cpumask {
+ my ($vmid, $mask, $curmask) = @_;
+ my $value = '';
+ my $changed = !$curmask;
+
+ for (my $id = 0; $id != @$mask; ++$id) {
+ if (!$mask->[$id]) {
+ $changed = 1 if !$changed && ($id < @$curmask && $curmask->[$id]);
+ next;
+ }
+ $changed = 1 if !$changed && ($id >= @$curmask || !$curmask->[$id]);
+ $value .= ',' if length($value);
+ $value .= $id;
+ }
+ if (!$changed && $curmask) {
+ for (my $id = @$mask; $id < @$curmask; ++$id) {
+ if ($curmask->[$id]) {
+ $changed = 1;
+ last;
+ }
+ }
+ }
+ return if !$changed;
+ open(my $fh, '>', "/sys/fs/cgroup/cpuset/lxc/$vmid/cpuset.cpus")
+ or die "failed to open cpuset for $vmid: $!\n";
+ print {$fh} "$value\n";
+ close($fh);
+}
+
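+# Count how many containers use each cpu (fixed cpusets included), then
+# spread the remaining running containers over the least-used allowed
+# cpus, giving each container at most $cpulimit cpus.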
+sub rebalance {
+ my ($vmstatus) = @_;
+
+ return if !-d '/sys/fs/cgroup/cpuset/lxc'; # nothing to do...
+
+ my $cpu_setlist = get_cpusets('lxc', 'effective_cpus');
+ my @allowed_cpus = number_setlist_to_list($cpu_setlist);
+ my $cpucount = scalar(@allowed_cpus);
+    return if !$cpucount; # nothing to balance without usable cpus
+    my $highest_cpuid = $allowed_cpus[-1];
+
+    my @cpu_ctcount = (0) x ($highest_cpuid+1);
+ my @balanced_cts;
+
+ foreach my $vmid (sort keys %$vmstatus) {
+ my $d = $vmstatus->{$vmid};
+ next if !$d->{pid};
+
+ my $conf = eval { PVE::LXC::Config->load_config($vmid) };
+ if ($@) {
+ warn $@;
+ next;
+ }
+
+ # get the current cpuset:
+ my $cpu_setlist = get_cpusets("lxc/$vmid");
+ my $cpu_list = [number_setlist_to_list($cpu_setlist)];
+	$highest_cpuid = $cpu_list->[-1]
+	    if @$cpu_list && $highest_cpuid < $cpu_list->[-1];
+
+ # container has a fixed set, count it
+ if (has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')) {
+ foreach my $cpu (@$cpu_list) {
+		$cpu_ctcount[$cpu]++ if $cpu < @cpu_ctcount;
+ }
+ } else {
+ my $cpulimit = $conf->{cpulimit};
+ $cpulimit = $cpucount if !$cpulimit || $cpulimit > $cpucount;
+ push @balanced_cts, [$vmid, $cpulimit, $cpu_list];
+ }
+ }
+
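+    # Rank the allowed cpus by the number of containers pinned to them,
+    # least-used first; balanced containers are assigned from the front.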
+ my @cpus_by_count = sort { $cpu_ctcount[$a] <=> $cpu_ctcount[$b] }
+ @allowed_cpus;
+
+ foreach my $bct (@balanced_cts) {
+ my ($vmid, $cpulimit, $cpu_list) = @$bct;
+
+ # Get the currently active cpu mask:
+	my $curmask = [(0) x ($highest_cpuid+1)];
+ $curmask->[$_] = 1 foreach @$cpu_list;
+
+ # Get the desired new cpu mask:
+	my $mask = [(0) x ($highest_cpuid+1)];
+ my $i;
+ for ($i = 0; $i < $cpulimit && $i < @cpus_by_count; ++$i) {
+ my $cpu = $cpus_by_count[$i];
+ $mask->[$cpu] = 1;
+ $cpu_ctcount[$cpu]++;
+ }
+
+ apply_cpumask($vmid, $mask, $curmask);
+
+ # We need to keep cpus_by_count sorted:
+    # 1) Each cpu is assigned to a container at most once, so if we walked
+    # up to the end of the sorted list, every count was bumped by one and
+    # the order is unchanged:
+ next if $i >= @cpus_by_count;
+
+ my $lastcpu = $cpus_by_count[$i-1];
+ my $nextcpu = $cpus_by_count[$i];
+ my $count = $cpu_ctcount[$nextcpu];
+    # 2) If the next count is at least the bumped-up count of the last cpu
+    # we assigned the container to, the order is still fine, too.
+ next if $count >= $cpu_ctcount[$lastcpu];
+
+    # 3) Find the range of cpus we need to move forward. Since the list is
+    # sorted, this is the run starting at the next cpu and ending at the
+    # first cpu with a different count (bumping any of them by even 1 makes
+    # them equal to the last assigned cpu's count).
+    # (This keeps the order stable with respect to equally-utilized cpus.)
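+    # For example, with counts (2,3,3,3) and two cpus assigned, the counts
+    # become (3,4,3,3) and the trailing run of 3s must move in front of
+    # the 4 to restore the order.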
+ my $from = $i;
+ ++$i;
+ while ($i < @cpus_by_count &&
+ $cpu_ctcount[$cpus_by_count[$i]] == $count) {
+ ++$i;
+ }
+ my $to = $i;
+
+    # 4) Find the insertion point: walk back to the last cpu with a count
+    # lower than or equal to the run we want to move:
+    $i = $from-1;
+    while ($i > 0 && $cpu_ctcount[$cpus_by_count[$i]] > $count) {
+	--$i;
+    }
+    # insert after that cpu, or at the very front if none qualifies:
+    $i++ if $cpu_ctcount[$cpus_by_count[$i]] <= $count;
+
+    # 5) Move:
+    my @range = splice(@cpus_by_count, $from, $to-$from);
+    splice(@cpus_by_count, $i, 0, @range);
+ }
+}
+
sub update_status {
# update worker list. This is not really required and
--
2.1.4