[pve-devel] [PATCH v6 qemu-server 08/10] memory: add virtio-mem support

Alexandre Derumier aderumier at odiso.com
Mon Jun 19 09:28:39 CEST 2023


4GiB of static memory is needed for DMA+boot memory, as this memory
can almost never be unplugged.

One virtio-mem PCI device is set up for each NUMA node on the pci.4 bridge.

virtio-mem uses a fixed blocksize with 32000 blocks.
The blocksize is computed as (maxmemory - 4096) / 32000, rounded up to a
power of two, with a minimum of 2MiB to align with THP.
(lower blocksize = more chance to unplug memory).

Note: Currently, Linux only supports a 4MiB virtio-mem blocksize; 2MiB support
is currently in progress.

For hotplug/unplug, we try to allocate/deallocate the same amount
of memory, aligned to the blocksize, on each NUMA node if possible.
If a node is not able to reach the target memory (could be an unmovable
page on unplug for example), we try again to redispatch the
remaining memory on the other nodes.

About hugepages:

For ordinary memory devices, such as DIMMs, we preallocate memory via the
memory backend for such use cases; however, with virtio-mem we're dealing
with sparse memory backends; preallocating the whole memory backend
destroys the whole purpose of virtio-mem.

Instead, we want to preallocate memory when actually exposing memory to the
VM dynamically, and fail plugging memory gracefully + warn the user in case
preallocation fails.

fixes:
https://bugzilla.proxmox.com/show_bug.cgi?id=2949
Signed-off-by: Alexandre Derumier <aderumier at odiso.com>
---
 PVE/API2/Qemu.pm         |   9 +-
 PVE/QemuServer.pm        |   8 +-
 PVE/QemuServer/Memory.pm | 219 ++++++++++++++++++++++++++++++++++++---
 PVE/QemuServer/PCI.pm    |   8 ++
 4 files changed, 226 insertions(+), 18 deletions(-)

diff --git a/PVE/API2/Qemu.pm b/PVE/API2/Qemu.pm
index f0b0dda..456ee97 100644
--- a/PVE/API2/Qemu.pm
+++ b/PVE/API2/Qemu.pm
@@ -32,7 +32,7 @@ use PVE::QemuServer::Drive;
 use PVE::QemuServer::ImportDisk;
 use PVE::QemuServer::Monitor qw(mon_cmd);
 use PVE::QemuServer::Machine;
-use PVE::QemuServer::Memory qw(get_current_memory parse_memory get_host_max_mem);
+use PVE::QemuServer::Memory qw(get_current_memory parse_memory get_host_max_mem get_virtiomem_block_size);
 use PVE::QemuServer::PCI;
 use PVE::QemuServer::USB;
 use PVE::QemuMigrate;
@@ -490,6 +490,13 @@ my $check_memory_param = sub {
     die "current memory value too large (must be smaller than max memory)\n"
 	if $mem->{max} && $mem->{current} > $mem->{max};
 
+    if ($mem->{virtio}) {
+	my $blocksize = get_virtiomem_block_size($conf);
+
+	die "memory need to be a multiple of $blocksize MiB when virtiomem is enabled\n"
+	    if $mem->{current} % $blocksize != 0;
+    }
+
     if ($param->{memory} || defined($param->{balloon})) {
 
 	my $memory = $param->{memory} || $conf->{pending}->{memory} || $conf->{memory};
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 1257902..7e0ade2 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -3887,8 +3887,14 @@ sub config_to_command {
 	push @$cmd, get_cpu_options($conf, $arch, $kvm, $kvm_off, $machine_version, $winversion, $gpu_passthrough);
     }
 
+    my $mem_devices = {};
     PVE::QemuServer::Memory::config(
-	$conf, $vmid, $sockets, $cores, $hotplug_features->{memory}, $cmd);
+	$conf, $vmid, $sockets, $cores, $hotplug_features->{memory}, $cmd, $mem_devices);
+
+    foreach my $id (sort keys %$mem_devices) {
+	my $pciaddr = print_pci_addr($id, $bridges, $arch, $machine_type);
+	push @$devices, "-device", "$mem_devices->{$id}$pciaddr";
+    }
 
     push @$cmd, '-S' if $conf->{freeze};
 
diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
index 0d64a5f..dcdf318 100644
--- a/PVE/QemuServer/Memory.pm
+++ b/PVE/QemuServer/Memory.pm
@@ -3,8 +3,10 @@ package PVE::QemuServer::Memory;
 use strict;
 use warnings;
 
+use POSIX qw(ceil);
+
 use PVE::Exception qw(raise raise_param_exc);
-use PVE::GuestHelpers qw(safe_num_ne);
+use PVE::GuestHelpers qw(safe_boolean_ne safe_num_ne);
 use PVE::JSONSchema;
 use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach);
 
@@ -16,6 +18,7 @@ our @EXPORT_OK = qw(
 get_current_memory
 parse_memory
 get_host_max_mem
+get_virtiomem_block_size
 );
 
 my $MAX_NUMA = 8;
@@ -38,6 +41,12 @@ our $memory_fmt = {
 	maximum => 4194304,
 	format => 'pve-qm-memory-max',
     },
+    virtio => {
+	description => "Use virtio-mem devices for hotplug (Experimental: Only works with Linux guest with kernel >= 5.10)",
+	type => 'boolean',
+	optional => 1,
+	default => 0,
+    },
 };
 
 PVE::JSONSchema::register_format('pve-qm-memory-max', \&verify_qm_memory_max);
@@ -74,7 +83,9 @@ my sub get_static_mem {
     my $static_memory = 0;
     my $memory = parse_memory($conf->{memory});
 
-    if ($memory->{max}) {
+    if ($memory->{virtio}) {
+	$static_memory = 4096;
+    } elsif ($memory->{max}) {
 	my $dimm_size = $memory->{max} / $MAX_SLOTS;
 	#static mem can't be lower than 4G and lower than 1 dimmsize by socket
 	$static_memory = $dimm_size * $sockets;
@@ -163,6 +174,121 @@ sub get_current_memory {
     return $memory->{current};
 }
 
+sub get_virtiomem_block_size { # returns the virtio-mem block size (a number of MiB) for $conf
+    my ($conf) = @_;
+
+    my $sockets = $conf->{sockets} || 1;
+    my $MAX_MEM = get_max_mem($conf);
+    my $static_memory = get_static_mem($conf, $sockets, 1);
+    my $memory = get_current_memory($conf->{memory}); # NOTE(review): $memory is never read below
+
+    #virtiomem is sized for 32000 blocks: spread the hotpluggable range (max - static) over them.
+    #try to use lowest blocksize, lower = more chance to unplug memory.
+    my $blocksize = ($MAX_MEM - $static_memory) / 32000;
+    #2MB is the minimum to be aligned with THP
+    $blocksize = 2 if $blocksize < 2;
+    $blocksize = 2**(ceil(log($blocksize)/log(2))); # round up to the next power of two
+    #Linux guest kernels only support 4MiB blocks currently (kernel <= 6.2)
+    $blocksize = 4 if $blocksize < 4;
+
+    return $blocksize;
+}
+
+my sub get_virtiomem_total_current_size { # sums the 'current' field of every entry in %$mems
+    my ($mems) = @_;
+    my $size = 0;
+    for my $mem (values %$mems) {
+	$size += $mem->{current};
+    }
+    return $size;
+}
+
+my sub balance_virtiomem { # spread $target_total (MiB) over all devices, then poll until each one settles; dies on failure
+    my ($vmid, $virtiomems, $blocksize, $target_total) = @_;
+
+    my $nb_virtiomem = scalar(keys %$virtiomems);
+
+    print"try to balance memory on $nb_virtiomem virtiomems\n";
+
+    #every device gets a blocksize-aligned share; if the split is not exact, the last device takes the remainder
+    my $target_aligned = int( $target_total / $nb_virtiomem / $blocksize) * $blocksize;
+    my $target_remaining = $target_total - ($target_aligned * ($nb_virtiomem-1));
+
+    my $i = 0;
+    foreach my $id (sort keys %$virtiomems) {
+	my $virtiomem = $virtiomems->{$id};
+	$i++;
+	my $virtiomem_target = $i == $nb_virtiomem ? $target_remaining : $target_aligned;
+	$virtiomem->{completed} = 0;
+	$virtiomem->{retry} = 0;
+	$virtiomem->{target} = $virtiomem_target;
+
+	print "virtiomem$id: set-requested-size : $virtiomem_target\n";
+	mon_cmd(
+	    $vmid,
+	    'qom-set',
+	    path => "/machine/peripheral/virtiomem$id",
+	    property => "requested-size",
+	    value => $virtiomem_target * 1024 * 1024 # MiB -> bytes
+	);
+    }
+
+    my $total_finished = 0;
+    my $error = undef;
+
+    while ($total_finished != $nb_virtiomem) { # poll until every device is either completed or flagged as error
+
+	sleep 1;
+
+	$total_finished = 0;
+
+	foreach my $id (keys %$virtiomems) {
+
+	    my $virtiomem = $virtiomems->{$id};
+
+	    if ($virtiomem->{error} || $virtiomem->{completed}) { # already settled: only count it
+		$total_finished++;
+		next;
+	    }
+
+	    my $size = mon_cmd($vmid, 'qom-get', path => "/machine/peripheral/virtiomem$id", property => "size");
+	    $virtiomem->{current} = $size / 1024 / 1024; # bytes -> MiB
+	    print"virtiomem$id: last: $virtiomem->{last} current: $virtiomem->{current} target: $virtiomem->{target} retry: $virtiomem->{retry}\n";
+
+	    if($virtiomem->{current} == $virtiomem->{target}) {
+		print"virtiomem$id: completed\n";
+		$virtiomem->{completed} = 1; # counted as finished on the next pass of the while loop
+		next;
+	    }
+
+	    if($virtiomem->{current} != $virtiomem->{last}) {
+		#size changed since the previous poll but target not reached yet: reset the retry counter
+		$virtiomem->{retry} = 0;
+		$virtiomem->{last} = $virtiomem->{current};
+		next;
+	    }
+
+	    if($virtiomem->{retry} >= 5) { # size unchanged over several consecutive polls: give up on this device
+		print "virtiomem$id: target memory still not reached, ignoring device from now on\n";
+		$virtiomem->{error} = 1;
+		$error = 1;
+		#the resize is async: pin requested-size to the current size so the value does not change after the api call
+		eval {
+		    mon_cmd(
+			$vmid,
+			'qom-set',
+			path => "/machine/peripheral/virtiomem$id",
+			property => "requested-size",
+			value => $virtiomem->{current} * 1024 *1024
+		    );
+		};
+	    }
+	    $virtiomem->{retry}++;
+	}
+    }
+    die "No more available blocks in virtiomem to balance all requested memory\n" if $error; # raised only after all devices settled
+}
+
 sub get_numa_node_list {
     my ($conf) = @_;
     my @numa_map;
@@ -249,7 +375,37 @@ sub qemu_memory_hotplug {
     my $MAX_MEM = get_max_mem($conf);
     die "you cannot add more memory than max mem $MAX_MEM MB!\n" if $value > $MAX_MEM;
 
-    if ($value > $memory) {
+    if ($oldmem->{virtio}) {
+	my $blocksize = get_virtiomem_block_size($conf);
+
+	my $virtiomems = {};
+
+	for (my $i = 0; $i < $sockets; $i++) {
+	    my $size = mon_cmd($vmid, 'qom-get', path => "/machine/peripheral/virtiomem$i", property => "size");
+	    $size = $size / 1024 /1024;
+	    $virtiomems->{$i} = {
+		current => $size,
+		last => $size,
+		error => 0,
+		completed => 0,
+		retry => 0
+	    };
+	}
+
+	my $target_total = $value - $static_memory;
+	my $err;
+	eval {
+	    balance_virtiomem($vmid, $virtiomems, $blocksize, $target_total);
+	};
+	$err = $@ if $@;
+
+	my $current_memory = $static_memory + get_virtiomem_total_current_size($virtiomems);
+	$newmem->{current} = $current_memory;
+	$conf->{memory} = print_memory($newmem);
+	PVE::QemuConfig->write_config($vmid, $conf);
+	die $err if $err;
+
+    } elsif ($value > $memory) {
 
 	my $numa_hostmap;
 
@@ -342,8 +498,8 @@ sub can_hotplug {
     my $oldmem = parse_memory($conf->{memory});
     my $newmem = parse_memory($value);
 
-    return if safe_num_ne($newmem->{max}, $oldmem->{max});
-
+    return if (safe_num_ne($newmem->{max}, $oldmem->{max}) ||
+	       safe_boolean_ne($newmem->{virtio}, $oldmem->{virtio}));
     return 1;
 }
 
@@ -365,7 +521,7 @@ sub qemu_memdevices_list {
 }
 
 sub config {
-    my ($conf, $vmid, $sockets, $cores, $hotplug, $cmd) = @_;
+    my ($conf, $vmid, $sockets, $cores, $hotplug, $cmd, $mem_devices) = @_;
 
     my $memory = get_current_memory($conf->{memory});
     my $static_memory = get_static_mem($conf, $sockets, $hotplug);
@@ -383,7 +539,10 @@ sub config {
 	die "minimum memory must be ${static_memory}MB\n" if($memory < $static_memory);
 	my $confmem = parse_memory($conf->{memory});
 	my $slots = $confmem->{max} ? $MAX_SLOTS : 255;
-	push @$cmd, '-m', "size=${static_memory},slots=$slots,maxmem=${MAX_MEM}M";
+	my $cmdstr = "size=${static_memory}";
+	$cmdstr .= ",slots=$slots" if !$confmem->{'virtio'};
+	$cmdstr .= ",maxmem=${MAX_MEM}M";
+	push @$cmd, '-m', $cmdstr;
 
     } else {
 
@@ -455,17 +614,42 @@ sub config {
     }
 
     if ($hotplug) {
-	foreach_dimm($conf, $vmid, $memory, $static_memory, sub {
-	    my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
 
-	    my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
+	my $confmem = parse_memory($conf->{memory});
 
-	    push @$cmd, "-object" , $mem_object;
-	    push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
+	if ($confmem->{'virtio'}) {
 
-	    die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
-		if $current_size > $memory;
-	});
+	    my $MAX_MEM = get_max_mem($conf);
+	    my $node_maxmem = ($MAX_MEM - $static_memory) / $sockets;
+	    my $node_mem = ($memory - $static_memory) / $sockets;
+	    my $blocksize = get_virtiomem_block_size($conf);
+
+	    die "memory need to be a multiple of $blocksize MiB with maxmemory $MAX_MEM MiB when virtiomem is enabled\n"
+		if $memory % $blocksize != 0;
+
+	    for (my $i = 0; $i < $sockets; $i++)  {
+
+		my $id = "virtiomem$i";
+		my $mem_object = print_mem_object($conf, "mem-$id", $node_maxmem);
+		push @$cmd, "-object" , "$mem_object,reserve=off";
+
+		my $mem_device = "virtio-mem-pci,block-size=${blocksize}M,requested-size=${node_mem}M,id=$id,memdev=mem-$id,node=$i";
+		$mem_device .= ",prealloc=on" if $conf->{hugepages};
+		$mem_devices->{$id} = $mem_device;
+	    }
+	} else {
+	    foreach_dimm($conf, $vmid, $memory, $static_memory, sub {
+		my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
+
+		my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
+
+		push @$cmd, "-object" , $mem_object;
+		push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
+
+		die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
+		    if $current_size > $memory;
+	    });
+	}
     }
 }
 
@@ -476,8 +660,11 @@ sub print_mem_object {
 
 	my $hugepages_size = hugepages_size($conf, $size);
 	my $path = hugepages_mount_path($hugepages_size);
+	my $confmem = parse_memory($conf->{memory});
 
-	return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+	my $object = "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on";
+	$object .= ",prealloc=yes" if !$confmem->{virtio};
+	return $object;
     } else {
 	return "memory-backend-ram,id=$id,size=${size}M";
     }
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index 1673041..722c56f 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -261,6 +261,14 @@ sub get_pci_addr_map {
 	'scsihw2' => { bus => 4, addr => 1 },
 	'scsihw3' => { bus => 4, addr => 2 },
 	'scsihw4' => { bus => 4, addr => 3 },
+	'virtiomem0' => { bus => 4, addr => 4 },
+	'virtiomem1' => { bus => 4, addr => 5 },
+	'virtiomem2' => { bus => 4, addr => 6 },
+	'virtiomem3' => { bus => 4, addr => 7 },
+	'virtiomem4' => { bus => 4, addr => 8 },
+	'virtiomem5' => { bus => 4, addr => 9 },
+	'virtiomem6' => { bus => 4, addr => 10 },
+	'virtiomem7' => { bus => 4, addr => 11 },
     } if !defined($pci_addr_map);
     return $pci_addr_map;
 }
-- 
2.39.2





More information about the pve-devel mailing list