[pve-devel] [PATCH qemu-server 3/3] pci: mdev: adapt to nvidia interface with kernel >= 6.8

Dominik Csapak d.csapak at proxmox.com
Tue Aug 6 14:22:02 CEST 2024


Since kernel 6.8, NVIDIA's vGPU driver no longer uses the generic mdev
interface, because it relied on a feature there which is no longer
available. IIUC the kernel documentation [0] recommends that drivers
implement their own device-specific features, since putting everything
into the generic interface does not make sense.

They now have an 'nvidia' folder in the device sysfs path, which
contains the files `creatable_vgpu_types`/`current_vgpu_type` to
control the virtual function's model, and then the whole virtual
function has to be passed through (although without resetting it and
without changing it to the vfio-pci driver).

This patch implements changes so that, from a config perspective, it
is still a mediated device, and we map the functionality iff the device
has no mediated devices but has NVIDIA's new sysfs API and the model
name is 'nvidia-<..>'.

It behaves a bit differently from mdevs and normal PCI passthrough, as
we have to choose the correct device immediately since it's bound to the
PCI ID, but we must not bind the device to vfio-pci, as the NVIDIA
driver implements this functionality itself.

When cleaning up, we iterate over all reserved devices (since for a
mapping we can't know at this point which one was chosen, other than by
looking at the reservations) and reset the vGPU model to '0', so that it
frees up the reservation on NVIDIA's side. (We also do that in a loop,
since the device is not always immediately ready after QEMU closes.)

A general problem (which already existed before) is that a showcmd (for
a guest that is not running) reserves the PCI IDs, which might block the
start of a different, real VM. This is now a bit more problematic, as we
then also (temporarily) set the vGPU type.

0: https://docs.kernel.org/driver-api/vfio-pci-device-specific-driver-acceptance.html

Signed-off-by: Dominik Csapak <d.csapak at proxmox.com>
---
 PVE/QemuServer.pm                | 25 ++++++++--
 PVE/QemuServer/PCI.pm            | 80 ++++++++++++++++++++++++++++++--
 test/run_config2command_tests.pl |  3 ++
 3 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index b2cbe00e..981e8fa7 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -5833,8 +5833,8 @@ sub vm_start_nolock {
 
 	    my $chosen_mdev;
 	    for my $dev ($d->{ids}->@*) {
-		my $info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $index, $d->{mdev}) };
-		if ($d->{mdev}) {
+		my $info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $index, $d) };
+		if ($d->{mdev} || $d->{nvidia}) {
 		    warn $@ if $@;
 		    $chosen_mdev = $info;
 		    last if $chosen_mdev; # if successful, we're done
@@ -5843,7 +5843,7 @@ sub vm_start_nolock {
 		}
 	    }
 
-	    next if !$d->{mdev};
+	    next if !$d->{mdev} && !$d->{nvidia};
 	    die "could not create mediated device\n" if !defined($chosen_mdev);
 
 	    # nvidia grid needs the uuid of the mdev as qemu parameter
@@ -6175,6 +6175,25 @@ sub cleanup_pci_devices {
     # templates don't use pci devices
     return if $conf->{template};
 
+    my $reservations = PVE::QemuServer::PCI::get_reservations($vmid);
+    # clean up nvidia devices
+    for my $id ($reservations->@*) {
+	$id = '0000:'.$id if $id !~ m/^0000:/;
+
+	my $create_path = "/sys/bus/pci/devices/$id/nvidia/current_vgpu_type";
+
+	next if ! -f $create_path;
+
+	for (my $i = 0; $i < 10; $i++) {
+	    last if file_read_firstline($create_path) eq "0";
+	    sleep 1;
+	    PVE::SysFSTools::file_write($create_path, "0");
+	}
+	if (file_read_firstline($create_path) ne "0") {
+	    warn "could not cleanup nvidia vgpu for '$id'\n";
+	}
+    }
+
     foreach my $key (keys %$conf) {
 	next if $key !~ m/^hostpci(\d+)$/;
 	my $hostpciindex = $1;
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index ae180e08..1ea043bb 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -453,7 +453,12 @@ sub parse_hostpci {
 	}
 	if (scalar($ids->@*) > 1) {
 	    $res->{'has-multifunction'} = 1;
-	    die "cannot use mediated device with multifunction device\n" if $res->{mdev};
+	    die "cannot use mediated device with multifunction device\n" if $res->{mdev} || $res->{nvidia};
+	} elsif ($res->{mdev}) {
+	    if ($ids->[0]->{nvidia} && $res->{mdev} =~ m/^nvidia-(\d+)$/) {
+		$res->{nvidia} = $1;
+		delete $res->{mdev};
+	    }
 	}
 	push $res->{ids}->@*, $ids;
     }
@@ -497,7 +502,7 @@ sub parse_hostpci_devices {
 	    die "legacy IGD assignment is not compatible with x-vga\n"
 		if $d->{'x-vga'};
 	    die "legacy IGD assignment is not compatible with mdev\n"
-		if $d->{mdev};
+		if $d->{mdev} || $d->{nvidia};
 	    die "legacy IGD assignment is not compatible with q35\n"
 		if $q35;
 	    die "legacy IGD assignment is not compatible with multifunction devices\n"
@@ -515,6 +520,41 @@ sub parse_hostpci_devices {
     return $parsed_devices;
 }
 
+# set vgpu type of a vf of an nvidia gpu with kernel 6.8 or newer
+my sub create_nvidia_device {
+    my ($id, $model) = @_;
+
+    $id = '0000:'.$id if $id !~ m/^0000:/;
+
+    my $creation = "/sys/bus/pci/devices/$id/nvidia/current_vgpu_type";
+
+    die "no nvidia sysfs api for '$id'\n" if ! -f $creation;
+
+    my $current = PVE::Tools::file_read_firstline($creation);
+    if ($current ne "0") {
+	return 1 if $current eq $model;
+	# reset vgpu type so we can see all available and set the real device
+	die "unable to reset vgpu type for '$id'\n" if !PVE::SysFSTools::file_write($creation, "0");
+    }
+
+    my $types = PVE::SysFSTools::get_mdev_types($id);
+    my $selected;
+    for my $type_definition ($types->@*) {
+	next if $type_definition->{type} ne "nvidia-$model";
+	$selected = $type_definition;
+    }
+
+    if (!defined($selected) || $selected->{available} < 1) {
+	die "vgpu type '$model' not available for '$id'\n";
+    }
+
+    if (!PVE::SysFSTools::file_write($creation, $model)) {
+	die "could not set vgpu type to '$model' for '$id'\n";
+    }
+
+    return 1;
+}
+
 # takes the hash returned by parse_hostpci_devices and for all non mdev gpus,
 # selects one of the given alternatives by trying to reserve it
 #
@@ -541,7 +581,7 @@ my sub choose_hostpci_devices {
 	my $device = $devices->{"hostpci$i"};
 	next if !$device;
 
-	if ($device->{mdev}) {
+	if ($device->{mdev} && !$device->{nvidia}) {
 	    $device->{ids} = [ map { $_->[0] } $device->{ids}->@* ];
 	    next;
 	}
@@ -550,6 +590,10 @@ my sub choose_hostpci_devices {
 	    # we only have one alternative, use that
 	    $device->{ids} = $device->{ids}->[0];
 	    $add_used_device->($device->{ids});
+	    if ($device->{nvidia} && !$is_running) {
+		reserve_pci_usage($device->{ids}->[0]->{id}, $vmid, 10, undef);
+		create_nvidia_device($device->{ids}->[0]->{id}, $device->{nvidia});
+	    }
 	    next;
 	}
 
@@ -563,6 +607,15 @@ my sub choose_hostpci_devices {
 		next if $@;
 	    }
 
+	    if ($device->{nvidia} && !$is_running) {
+		eval { create_nvidia_device($ids->[0], $device->{nvidia}) };
+		if (my $err = $@) {
+		    warn $err;
+		    remove_pci_reservation($vmid, $ids);
+		    next;
+		}
+	    }
+
 	    # found one that is not used or reserved
 	    $add_used_device->($alternative);
 	    $device->{ids} = $alternative;
@@ -661,13 +714,15 @@ sub print_hostpci_devices {
 }
 
 sub prepare_pci_device {
-    my ($vmid, $pciid, $index, $mdev) = @_;
+    my ($vmid, $pciid, $index, $device) = @_;
 
     my $info = PVE::SysFSTools::pci_device_info("$pciid");
     die "cannot prepare PCI pass-through, IOMMU not present\n" if !PVE::SysFSTools::check_iommu_support();
     die "no pci device info for device '$pciid'\n" if !$info;
 
-    if ($mdev) {
+    if ($device->{nvidia}) {
+	# nothing to do
+    } elsif (my $mdev = $device->{mdev}) {
 	my $uuid = generate_mdev_uuid($vmid, $index);
 	PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $mdev);
     } else {
@@ -734,6 +789,21 @@ sub remove_pci_reservation {
     die $@ if $@;
 }
 
+# return all currently reserved ids from the given vmid
+sub get_reservations {
+    my ($vmid) = @_;
+
+    my $reservations = $parse_pci_reservation_unlocked->();
+
+    my $list = [];
+
+    for my $pci_id (sort keys $reservations->%*) {
+	push $list->@*, $pci_id if $reservations->{$pci_id}->{vmid} == $vmid;
+    }
+
+    return $list;
+}
+
 sub reserve_pci_usage {
     my ($requested_ids, $vmid, $timeout, $pid) = @_;
 
diff --git a/test/run_config2command_tests.pl b/test/run_config2command_tests.pl
index 9b5e87ff..8c525f09 100755
--- a/test/run_config2command_tests.pl
+++ b/test/run_config2command_tests.pl
@@ -387,6 +387,9 @@ $pci_module->mock(
 
 	return undef;
     },
+    create_nvidia_device => sub {
+	return 1;
+    }
 );
 
 sub diff($$) {
-- 
2.39.2





More information about the pve-devel mailing list