[pve-devel] [PATCH v2 qemu-server 8/9] memory: add virtio-mem support
Alexandre Derumier
aderumier at odiso.com
Wed Jan 4 07:43:02 CET 2023
A 4GB static memory is needed for DMA+boot memory, as this memory
can almost never be unplugged.
One virtio-mem PCI device is set up for each NUMA node on the pci.4 bridge.
virtio-mem uses a fixed blocksize with 32000 blocks.
The blocksize is computed as (maxmemory - 4096) / 32000, with a minimum of
2MB to map THP
(lower blocksize = more chance to unplug memory).
Note: Currently, Linux only supports a 4MB virtio-mem blocksize; 2MB support
is currently in progress.
For hotplug/unplug, we try to allocate/deallocate the same amount
of memory, aligned to the blocksize, on each NUMA node if possible.
If a node is not able to reach the target memory (for example because of
an unmovable page on unplug), we try again, redispatching the
remaining memory across the other nodes.
About hugepages:
For ordinary memory devices, such as DIMMs, we preallocate memory via the
memory backend for such use cases; however, with virtio-mem we're dealing
with sparse memory backends; preallocating the whole memory backend
destroys the whole purpose of virtio-mem.
Instead, we want to preallocate memory when actually exposing memory to the
VM dynamically, and fail plugging memory gracefully + warn the user in case
preallocation fails.
fixes:
https://bugzilla.proxmox.com/show_bug.cgi?id=931
https://bugzilla.proxmox.com/show_bug.cgi?id=2949
---
Signed-off-by: Alexandre Derumier <aderumier at odiso.com>
---
PVE/API2/Qemu.pm | 10 +-
PVE/QemuServer.pm | 7 +-
PVE/QemuServer/Memory.pm | 233 ++++++++++++++++++++++++++++++++++++---
PVE/QemuServer/PCI.pm | 8 ++
4 files changed, 242 insertions(+), 16 deletions(-)
diff --git a/PVE/API2/Qemu.pm b/PVE/API2/Qemu.pm
index cab1e84..42941ac 100644
--- a/PVE/API2/Qemu.pm
+++ b/PVE/API2/Qemu.pm
@@ -32,7 +32,7 @@ use PVE::QemuServer::Drive;
use PVE::QemuServer::ImportDisk;
use PVE::QemuServer::Monitor qw(mon_cmd);
use PVE::QemuServer::Machine;
-use PVE::QemuServer::Memory qw(get_current_memory parse_memory get_host_max_mem);
+use PVE::QemuServer::Memory qw(get_current_memory parse_memory get_host_max_mem get_virtiomem_block_size);
use PVE::QemuMigrate;
use PVE::RPCEnvironment;
use PVE::AccessControl;
@@ -487,6 +487,14 @@ my $check_memory_param = sub {
if $mem->{max} > $host_max_mem;
}
+ #unplug works better with 128MB by dimm to match the linux blocksize btyes.
+ if ($mem->{virtio}) {
+ my $blocksize = get_virtiomem_block_size($conf);
+
+ die "memory need to be a multiple of $blocksize MB when virtiomem is enabled\n"
+ if $mem->{current} % $blocksize != 0;
+ }
+
if ($param->{memory} || defined($param->{balloon})) {
my $maxmem = undef;
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 5847a78..51b29fc 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -3857,7 +3857,12 @@ sub config_to_command {
push @$cmd, get_cpu_options($conf, $arch, $kvm, $kvm_off, $machine_version, $winversion, $gpu_passthrough);
}
- PVE::QemuServer::Memory::config($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd);
+ my $mem_devices = {};
+ PVE::QemuServer::Memory::config($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd, $mem_devices);
+ foreach my $id (sort keys %$mem_devices) {
+ my $pciaddr = print_pci_addr($id, $bridges, $arch, $machine_type);
+ push @$devices, "-device", "$mem_devices->{$id}$pciaddr";
+ }
push @$cmd, '-S' if $conf->{freeze};
diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
index b9136d2..6827004 100644
--- a/PVE/QemuServer/Memory.pm
+++ b/PVE/QemuServer/Memory.pm
@@ -3,6 +3,8 @@ package PVE::QemuServer::Memory;
use strict;
use warnings;
+use POSIX qw/ceil/;
+
use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach);
use PVE::Exception qw(raise raise_param_exc);
use PVE::GuestHelpers qw(safe_string_ne safe_num_ne safe_boolean_ne);
@@ -15,6 +17,7 @@ our @EXPORT_OK = qw(
get_current_memory
parse_memory
get_host_max_mem
+get_virtiomem_block_size
);
my $MAX_NUMA = 8;
@@ -37,6 +40,12 @@ my $memory_fmt = {
minimum => 65536,
maximum => 4194304
},
+ virtio => {
+ description => "enable virtio-mem memory",
+ type => 'boolean',
+ optional => 1,
+ default => 0,
+ }
};
sub print_memory {
@@ -69,7 +78,9 @@ my sub get_static_mem {
my $static_memory = 0;
my $memory = parse_memory($conf->{memory});
- if ($memory->{max}) {
+ if ($memory->{virtio}) {
+ $static_memory = 4096;
+ } elsif ($memory->{max}) {
my $dimm_size = $memory->{max} / $MAX_SLOTS;
#static mem can't be lower than 4G and lower than 1 dimmsize by socket
$static_memory = $dimm_size * $sockets;
@@ -160,6 +171,134 @@ sub get_current_memory {
return $memory->{current};
}
+sub get_virtiomem_block_size {
+ my ($conf) = @_;
+
+ my $MAX_MEM = get_max_mem($conf);
+ my $static_memory = get_static_mem($conf);
+ my $memory = get_current_memory($conf->{memory});
+
+ #virtiomem can map 32000 block size.
+ #try to use lowest blocksize, lower = more chance to unplug memory.
+ my $blocksize = ($MAX_MEM - $static_memory) / 32000;
+ #2MB is the minimum to be aligned with THP
+ $blocksize = 2**(ceil(log($blocksize)/log(2)));
+ $blocksize = 4 if $blocksize < 4;
+
+ return $blocksize;
+}
+
+my sub get_virtiomem_total_current {
+ my ($mems) = @_;
+ my $total = 0;
+ foreach my $id (keys %$mems) {
+ my $mem = $mems->{$id};
+ $total += $mem->{current};
+ }
+ return $total;
+}
+
+my sub get_virtiomem_total_noerror {
+ my ($mems) = @_;
+
+ my $total = 0;
+ foreach my $id (keys %$mems) {
+ my $mem = $mems->{$id};
+ next if $mem->{error};
+ $total++;
+ }
+ return $total;
+}
+
+my sub get_virtiomem_total_errors_size {
+ my ($mems) = @_;
+
+ my $size = 0;
+ foreach my $id (keys %$mems) {
+ my $mem = $mems->{$id};
+ next if !$mem->{error};
+ $size += $mem->{current};
+ }
+ return $size;
+}
+
+my sub balance_virtiomem {
+ my ($vmid, $virtiomems, $blocksize, $target_virtiomem_total) = @_;
+
+ my $virtiomem_total_noerror = get_virtiomem_total_noerror($virtiomems);
+
+ print"try to balance memory on $virtiomem_total_noerror remaining virtiomems\n";
+
+ die "error. no more available blocks in virtiomem to balance the remaining memory" if $target_virtiomem_total < 0;
+ die "error. No more available virtiomem to balance the remaining memory\n" if $virtiomem_total_noerror == 0;
+
+ my $virtiomem_target_aligned = int( $target_virtiomem_total / $virtiomem_total_noerror / $blocksize) * $blocksize;
+ my $virtiomem_target_remaining = $target_virtiomem_total - ($virtiomem_target_aligned * ($virtiomem_total_noerror-1));
+
+ my $i = 0;
+ foreach my $id (sort keys %$virtiomems) {
+ my $virtiomem = $virtiomems->{$id};
+ next if $virtiomem->{error};
+ $i++;
+ my $virtiomem_target = $i == $virtiomem_total_noerror ? $virtiomem_target_remaining : $virtiomem_target_aligned;
+ $virtiomem->{completed} = 0;
+ $virtiomem->{retry} = 0;
+ $virtiomem->{target} = $virtiomem_target;
+
+ print "virtiomem$id: set-requested-size : $virtiomem_target\n";
+ mon_cmd($vmid, 'qom-set', path => "/machine/peripheral/virtiomem$id", property => "requested-size", value => $virtiomem_target * 1024 * 1024);
+ }
+
+ while (1) {
+
+ sleep 1;
+ my $total_finished = 0;
+
+ foreach my $id (keys %$virtiomems) {
+
+ my $virtiomem = $virtiomems->{$id};
+
+ if ($virtiomem->{error} || $virtiomem->{completed}) {
+ $total_finished++;
+ next;
+ }
+
+ my $size = mon_cmd($vmid, 'qom-get', path => "/machine/peripheral/virtiomem$id", property => "size");
+ $virtiomem->{current} = $size / 1024 / 1024;
+ print"virtiomem$id: virtiomem->last: $virtiomem->{last} virtiomem->current: $virtiomem->{current} virtio_mem_target:$virtiomem->{target}\n";
+
+ if($virtiomem->{current} == $virtiomem->{target}) {
+ print"virtiomem$id: completed\n";
+ $virtiomem->{completed} = 1;
+ next;
+ }
+
+ if($virtiomem->{current} != $virtiomem->{last}) {
+ #if value has changed, but not yet completed
+ print "virtiomem$id: changed but don't not reach target yet\n";
+ $virtiomem->{retry} = 0;
+ $virtiomem->{last} = $virtiomem->{current};
+ next;
+ }
+
+ if($virtiomem->{retry} >= 5) {
+ print "virtiomem$id: too many retry. set error\n";
+ $virtiomem->{error} = 1;
+ #as change is async, we don't want that value change after the api call
+ eval {
+ mon_cmd($vmid, 'qom-set', path => "/machine/peripheral/virtiomem$id", property => "requested-size", value => $virtiomem->{current} * 1024 *1024);
+ };
+ }
+ print"virtiomem$id: increase retry: $virtiomem->{retry}\n";
+ $virtiomem->{retry}++;
+ }
+
+ my $nb_virtiomem = keys %$virtiomems;
+ print"total finished: $total_finished numberof virtiomem:$nb_virtiomem \n";
+ return if $total_finished == $nb_virtiomem;
+ }
+}
+
sub get_numa_node_list {
my ($conf) = @_;
my @numa_map;
@@ -266,7 +405,8 @@ sub qemu_memory_hotplug {
my $newmem = parse_memory($value);
# skip non hotpluggable value
- if (safe_num_ne($newmem->{max}, $oldmem->{max})) {
+ if (safe_num_ne($newmem->{max}, $oldmem->{max}) ||
+ safe_boolean_ne($newmem->{virtio}, $oldmem->{virtio})) {
die "skip\n";
}
@@ -284,7 +424,43 @@ sub qemu_memory_hotplug {
my $MAX_MEM = get_max_mem($conf);
die "you cannot add more memory than max mem $MAX_MEM MB!\n" if $memory > $MAX_MEM;
- if ($value > $memory) {
+ my $confmem = parse_memory($conf->{memory});
+
+ if ($confmem->{virtio}) {
+ my $blocksize = get_virtiomem_block_size($conf);
+
+ my $virtiomems = {};
+ for (my $i = 0; $i < $sockets; $i++) {
+ my $size = mon_cmd($vmid, 'qom-get', path => "/machine/peripheral/virtiomem$i", property => "size");
+ $size = $size / 1024 /1024;
+ $virtiomems->{$i} = {
+ current => $size,
+ last => $size,
+ error => 0,
+ completed => 0,
+ retry => 0
+ };
+ }
+
+ while (1) {
+
+ my $target_virtiomem_total = $value - $static_memory - get_virtiomem_total_errors_size($virtiomems);
+ my $err;
+ eval {
+ balance_virtiomem($vmid, $virtiomems, $blocksize, $target_virtiomem_total);
+ };
+ $err = $@ if $@;
+
+ my $current_memory = $static_memory + get_virtiomem_total_current($virtiomems);
+ $newmem->{current} = $current_memory;
+ $conf->{memory} = print_memory($newmem);
+ PVE::QemuConfig->write_config($vmid, $conf);
+
+ die $err if $err;
+ last if $current_memory == $value;
+ }
+
+ } elsif ($value > $memory) {
my $numa_hostmap;
@@ -382,7 +558,7 @@ sub qemu_dimm_list {
}
sub config {
- my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_;
+ my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd, $mem_devices) = @_;
my $memory = get_current_memory($conf->{memory});
@@ -401,7 +577,10 @@ sub config {
die "minimum memory must be ${static_memory}MB\n" if($memory < $static_memory);
my $confmem = parse_memory($conf->{memory});
my $slots = $confmem->{max} ? $MAX_SLOTS : 255;
- push @$cmd, '-m', "size=${static_memory},slots=$slots,maxmem=${MAX_MEM}M";
+ my $cmdstr = "size=${static_memory}";
+ $cmdstr .= ",slots=$slots" if !$confmem->{'virtio'};
+ $cmdstr .= ",maxmem=${MAX_MEM}M";
+ push @$cmd, '-m', $cmdstr;
} else {
push @$cmd, '-m', $static_memory;
@@ -471,29 +650,55 @@ sub config {
}
if ($hotplug_features->{memory}) {
- foreach_dimm($conf, $vmid, $memory, $sockets, sub {
- my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
- my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
+ my $confmem = parse_memory($conf->{memory});
- push @$cmd, "-object" , $mem_object;
- push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
+ if ($confmem->{'virtio'}) {
+ my $MAX_MEM = get_max_mem($conf);
+ my $node_maxmem = ($MAX_MEM - $static_memory) / $sockets;
+ my $node_mem = ($memory - $static_memory) / $sockets;
+ my $blocksize = get_virtiomem_block_size($conf);
- die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
- if $current_size > $memory;
- });
+ for (my $i = 0; $i < $sockets; $i++) {
+
+ my $id = "virtiomem$i";
+ my $mem_object = print_mem_object($conf, "mem-$id", $node_maxmem);
+ push @$cmd, "-object" , "$mem_object,reserve=off";
+
+ my $mem_device = "virtio-mem-pci,block-size=${blocksize}M,requested-size=${node_mem}M,id=$id,memdev=mem-$id,node=$i";
+ $mem_device .= ",prealloc=on" if $conf->{hugepages};
+ $mem_devices->{$id} = $mem_device;
+ }
+ } else {
+
+ foreach_dimm($conf, $vmid, $memory, $sockets, sub {
+ my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
+
+ my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
+
+ push @$cmd, "-object" , $mem_object;
+ push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
+
+ die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
+ if $current_size > $memory;
+ });
+ }
}
}
sub print_mem_object {
my ($conf, $id, $size) = @_;
+ my $confmem = parse_memory($conf->{memory});
+
if ($conf->{hugepages}) {
my $hugepages_size = hugepages_size($conf, $size);
my $path = hugepages_mount_path($hugepages_size);
- return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+ my $object = "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on";
+ $object .= ",prealloc=yes" if !$confmem->{virtio};
+ return $object;
} else {
return "memory-backend-ram,id=$id,size=${size}M";
}
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index a18b974..0187c74 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -249,6 +249,14 @@ sub get_pci_addr_map {
'scsihw2' => { bus => 4, addr => 1 },
'scsihw3' => { bus => 4, addr => 2 },
'scsihw4' => { bus => 4, addr => 3 },
+ 'virtiomem0' => { bus => 4, addr => 4 },
+ 'virtiomem1' => { bus => 4, addr => 5 },
+ 'virtiomem2' => { bus => 4, addr => 6 },
+ 'virtiomem3' => { bus => 4, addr => 7 },
+ 'virtiomem4' => { bus => 4, addr => 8 },
+ 'virtiomem5' => { bus => 4, addr => 9 },
+ 'virtiomem6' => { bus => 4, addr => 10 },
+ 'virtiomem7' => { bus => 4, addr => 11 },
} if !defined($pci_addr_map);
return $pci_addr_map;
}
--
2.30.2
More information about the pve-devel
mailing list