[pve-devel] [RFC DO NOT APPLY qemu-server] WIP: pci: mdev: adapt to nvidia interface with kernel >= 6.8
Dominik Csapak
d.csapak at proxmox.com
Tue Jul 23 10:23:36 CEST 2024
Note: this is not intended to be applied, but more of a POC.
Since kernel 6.8, NVIDIA's vGPU driver does not use the generic mdev
interface anymore, since it relied on a feature there which is no longer
available. IIUC the kernel documentation [0] recommends that drivers
implement their own device-specific features, since putting everything
into the generic interface does not make sense.
They now have an 'nvidia' folder in the device sysfs path, which
contains the files `creatable_vgpu_types`/`current_vgpu_type` to
control the virtual function's model, and then the whole virtual function
has to be passed through (although without resetting it and without
rebinding it to the vfio-pci driver).
This patch implements a very basic (and incomplete) workaround for
making it work with our configs again, to show what has to be touched
(and what we would have to do). For this I have special-cased the nvidia
part in several places, but IMHO this is not ideal.
For a proper fix, I'd suggest the following:
Invent an interface that does everything we need (e.g. list models,
check availability, create, cleanup, get the qemu cli option (host/sysfsdev
part only), etc.) and implement a plugin system with 3 plugins:
raw, mdev, nvidia
then we'd change the pci config to e.g. instead of having
hostpci0: mapping=foo,mdev=nvidia-xyz
we'd have for nvidia:
hostpci0: mapping=foo,kind=nvidia,model=xyz
or for devices which implement mdevs:
hostpci0: mapping=foo,kind=mdev,model=mdev-model
With a legacy handling of the 'mdev' property to map to the nvidia one
in case the device does not support mdevs and it begins with 'nvidia-'.
This way we can implement new interfaces much more quickly should they
arise, and the resulting pci passthrough code should also be a bit
cleaner, since the kind specific things are hidden in the relevant
plugins.
Does that sound sensible? Alternatively, I can of course just extend the
current implementation and we can pretend the nvidia driver is still using
mediated devices, but then we'd have to map it everywhere (including the
mdev scanning api; TBD).
0: https://docs.kernel.org/driver-api/vfio-pci-device-specific-driver-acceptance.html
Signed-off-by: Dominik Csapak <d.csapak at proxmox.com>
---
PVE/QemuServer.pm | 28 ++++++++++++++++--
PVE/QemuServer/PCI.pm | 66 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 85 insertions(+), 9 deletions(-)
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index bf59b091..e7a3557c 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -5835,8 +5835,16 @@ sub vm_start_nolock {
my $chosen_mdev;
for my $dev ($d->{ids}->@*) {
- my $info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $index, $d->{mdev}) };
+ my $virtual = { type => 'raw' };
if ($d->{mdev}) {
+ $virtual->{type} = 'mdev';
+ $virtual->{mdev} = $d->{mdev};
+ } elsif ($d->{nvidia}) {
+ $virtual->{type} = 'nvidia';
+ $virtual->{nvidia} = $d->{nvidia};
+ }
+ my $info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $index, $virtual) };
+ if ($d->{mdev} || $d->{nvidia}) {
warn $@ if $@;
$chosen_mdev = $info;
last if $chosen_mdev; # if successful, we're done
@@ -5845,7 +5853,7 @@ sub vm_start_nolock {
}
}
- next if !$d->{mdev};
+ next if !$d->{mdev} && !$d->{nvidia};
die "could not create mediated device\n" if !defined($chosen_mdev);
# nvidia grid needs the uuid of the mdev as qemu parameter
@@ -6172,6 +6180,22 @@ sub cleanup_pci_devices {
# templates don't use pci devices
return if $conf->{template};
+ my $reservations = PVE::QemuServer::PCI::get_reservations($vmid);
+ # clean up nvidia devices
+ use Data::Dumper;
+ warn Dumper $reservations;
+ for my $id ($reservations->@*) {
+ $id = '0000:'.$id if $id !~ m/^0000:/;
+
+ warn Dumper $id;
+ my $create_path = "/sys/bus/pci/devices/$id/nvidia/current_vgpu_type";
+ sleep 1;
+ if (!PVE::SysFSTools::file_write($create_path, "0")) {
+ warn "could not set vgpu type to '0' for '$id'\n";
+ next;
+ }
+ }
+
foreach my $key (keys %$conf) {
next if $key !~ m/^hostpci(\d+)$/;
my $hostpciindex = $1;
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index 1673041b..4b5a9732 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -447,13 +447,21 @@ sub parse_hostpci {
for my $alternative ($alternatives->@*) {
my $ids = [];
foreach my $id ($alternative->@*) {
- my $devs = PVE::SysFSTools::lspci($id);
+ my $devs = PVE::SysFSTools::lspci($id, 1);
die "no PCI device found for '$id'\n" if !scalar($devs->@*);
push $ids->@*, @$devs;
}
if (scalar($ids->@*) > 1) {
$res->{'has-multifunction'} = 1;
- die "cannot use mediated device with multifunction device\n" if $res->{mdev};
+ die "cannot use mediated device with multifunction device\n" if $res->{mdev} || $res->{nvidia};
+ } elsif ($res->{mdev}) {
+ if (!$ids->[0]->{mdev}) {
+ # mdev configured but no mdev on device, legacy config, map to nvidia?
+ if ($res->{mdev} =~ m/^nvidia-(.*)$/) {
+ $res->{nvidia} = $1;
+ delete $res->{mdev};
+ }
+ }
}
push $res->{ids}->@*, $ids;
}
@@ -497,7 +505,7 @@ sub parse_hostpci_devices {
die "legacy IGD assignment is not compatible with x-vga\n"
if $d->{'x-vga'};
die "legacy IGD assignment is not compatible with mdev\n"
- if $d->{mdev};
+ if $d->{mdev} || $d->{nvidia};
die "legacy IGD assignment is not compatible with q35\n"
if $q35;
die "legacy IGD assignment is not compatible with multifunction devices\n"
@@ -534,6 +542,21 @@ my sub choose_hostpci_devices {
}
};
+ my $create_nvidia_device = sub {
+ my ($id, $model) = @_;
+
+ $id = '0000:'.$id if $id !~ m/^0000:/; # prepend domain 0000 unless already present
+
+ my $creation = "/sys/bus/pci/devices/$id/nvidia/current_vgpu_type";
+ # TODO: parse creatable_vgpu_types and check that $model is available before writing
+ if (!PVE::SysFSTools::file_write($creation, $model)) {
+ warn "could not set vgpu type to '$model' for '$id'\n";
+ return 0;
+ }
+
+ return 1;
+ };
+
for (my $i = 0; $i < $MAX_HOSTPCI_DEVICES; $i++) {
my $device = $devices->{"hostpci$i"};
next if !$device;
@@ -547,6 +570,10 @@ my sub choose_hostpci_devices {
# we only have one alternative, use that
$device->{ids} = $device->{ids}->[0];
$add_used_device->($device->{ids});
+ if ($device->{nvidia}) {
+ reserve_pci_usage([map { $_->{id} } $device->{ids}->@*], $vmid, 10, undef);
+ $create_nvidia_device->($device->{ids}->[0]->{id}, $device->{nvidia}); # pass the id string, not the device hash
+ }
next;
}
@@ -559,6 +586,10 @@ my sub choose_hostpci_devices {
next if $@;
# found one that is not used or reserved
+ if ($device->{nvidia}) {
+ $create_nvidia_device->($ids->[0], $device->{nvidia});
+ }
+
$add_used_device->($alternative);
$device->{ids} = $alternative;
$found = 1;
@@ -656,20 +687,26 @@ sub print_hostpci_devices {
}
sub prepare_pci_device {
- my ($vmid, $pciid, $index, $mdev) = @_;
+ my ($vmid, $pciid, $index, $virtual) = @_;
+
+ $virtual //= { type => 'raw', };
my $info = PVE::SysFSTools::pci_device_info("$pciid");
die "cannot prepare PCI pass-through, IOMMU not present\n" if !PVE::SysFSTools::check_iommu_support();
die "no pci device info for device '$pciid'\n" if !$info;
- if ($mdev) {
+ if ($virtual->{type} eq 'nvidia') {
+ # nothing to do
+ } elsif ($virtual->{type} eq 'mdev') {
my $uuid = generate_mdev_uuid($vmid, $index);
- PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $mdev);
- } else {
+ PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $virtual->{mdev});
+ } elsif ($virtual->{type} eq 'raw') {
die "can't unbind/bind PCI group to VFIO '$pciid'\n"
if !PVE::SysFSTools::pci_dev_group_bind_to_vfio($pciid);
die "can't reset PCI device '$pciid'\n"
if $info->{has_fl_reset} && !PVE::SysFSTools::pci_dev_reset($info);
+ } else {
+ die "unknown virtual type '$virtual->{type}'\n";
}
return $info;
@@ -728,6 +765,21 @@ sub remove_pci_reservation {
die $@ if $@;
}
+# Returns an array ref with the (sorted) PCI ids whose reservation entry belongs to $vmid.
+sub get_reservations {
+ my ($vmid) = @_;
+
+ my $reservations = $parse_pci_reservation_unlocked->();
+
+ my $list = [];
+
+ for my $pci_id (sort keys $reservations->%*) {
+ push $list->@*, $pci_id if $reservations->{$pci_id}->{vmid} == $vmid;
+ }
+
+ return $list;
+}
+
sub reserve_pci_usage {
my ($requested_ids, $vmid, $timeout, $pid) = @_;
--
2.39.2
More information about the pve-devel
mailing list