[pve-devel] [PATCH v4 container 1/1] Add device passthrough

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Nov 15 15:14:50 CET 2023


concept wise this looks pretty much OK, but a few (mostly code-style) comments in line

Am 13/11/2023 um 11:30 schrieb Filip Schauer:
> Add a dev[n] argument to the container config to pass devices through to
> a container. A device can be passed by its path. Additionally the access
> mode, uid and gid can be specified through their respective properties.
> 
> Signed-off-by: Filip Schauer <f.schauer at proxmox.com>
> ---
>  src/PVE/LXC.pm            | 57 +++++++++++++++++++++++-
>  src/PVE/LXC/Config.pm     | 93 +++++++++++++++++++++++++++++++++++++++
>  src/PVE/LXC/Tools.pm      | 23 +++++++---
>  src/lxc-pve-autodev-hook  | 18 +++++++-
>  src/lxc-pve-prestart-hook | 60 ++++++++++++++++++++++++-
>  5 files changed, 242 insertions(+), 9 deletions(-)
> 
> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
> index 7ec816b..242b2a0 100644
> --- a/src/PVE/LXC.pm
> +++ b/src/PVE/LXC.pm
> @@ -5,7 +5,7 @@ use warnings;
>  
>  use Cwd qw();
>  use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
>  use File::Path;
>  use File::Spec;
>  use IO::Poll qw(POLLIN POLLHUP);
> @@ -639,6 +639,31 @@ sub update_lxc_config {
>  	$raw .= "lxc.mount.auto = sys:mixed\n";
>      }
>  
> +    PVE::LXC::Config->foreach_passthrough_device($conf, sub {
> +	my ($key, $device) = @_;
> +
> +	die "Path is not defined for passthrough device $key"
> +	    unless (defined($device->{path}));
> +
> +	my $absolute_path = $device->{path};
> +
> +	die "Device $absolute_path does not exist\n"
> +		unless (-e $absolute_path);
> +
> +	die "$absolute_path is not a device\n"
> +		unless (-b $absolute_path || -c $absolute_path);

Those are basically doing a stat each, but you do one below anyway,
so maybe use that one for the checks

> +
> +	my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> +
> +	die "Could not get mode or device ID of $absolute_path\n"
> +	    if (!defined($mode) || !defined($rdev));
> +
> +	my $major = PVE::Tools::dev_t_major($rdev);
> +	my $minor = PVE::Tools::dev_t_minor($rdev);
> +	my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> +	$raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
> +    });
> +
>      # WARNING: DO NOT REMOVE this without making sure that loop device nodes
>      # cannot be exposed to the container with r/w access (cgroup perms).
>      # When this is enabled mounts will still remain in the monitor's namespace
> @@ -1318,6 +1343,8 @@ sub check_ct_modify_config_perm {
>  	    $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
>  	    check_bridge_access($rpcenv, $authuser, $oldconf->{$opt}) if $oldconf->{$opt};
>  	    check_bridge_access($rpcenv, $authuser, $newconf->{$opt}) if $newconf->{$opt};
> +	} elsif ($opt =~ m/^dev\d+$/) {
> +	    raise_perm_exc("configuring device passthrough is only allowed for root\@pam");
>  	} elsif ($opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') {
>  	    $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
>  	} elsif ($opt eq 'features') {
> @@ -2367,6 +2394,34 @@ sub validate_id_maps {
>      }
>  }
>  
> +sub map_ct_id_to_host {
> +    my ($id, $id_map, $id_type) = @_;
> +
> +    foreach my $mapping (@$id_map) {

use for over foreach

> +	my ($type, $ct, $host, $length) = @$mapping;
> +
> +	next if ($type ne $id_type);
> +
> +	if ($id >= int($ct) && $id < ($ct + $length)) {
> +	    return $host - $ct + $id;

do we want to throw around some int() here too for good measure?

> +	}
> +    }
> +
> +    return $id;
> +}
> +
> +sub map_ct_uid_to_host {
> +    my ($uid, $id_map) = @_;
> +
> +    return map_ct_id_to_host($uid, $id_map, 'u');
> +}
> +
> +sub map_ct_gid_to_host {
> +    my ($gid, $id_map) = @_;
> +
> +    return map_ct_id_to_host($gid, $id_map, 'g');
> +}
> +
>  sub userns_command {
>      my ($id_map) = @_;
>      if (@$id_map) {
> diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
> index 56e1f10..9f325f2 100644
> --- a/src/PVE/LXC/Config.pm
> +++ b/src/PVE/LXC/Config.pm
> @@ -29,6 +29,7 @@ mkdir $lockdir;
>  mkdir "/etc/pve/nodes/$nodename/lxc";
>  my $MAX_MOUNT_POINTS = 256;
>  my $MAX_UNUSED_DISKS = $MAX_MOUNT_POINTS;
> +my $MAX_DEVICES = 256;
>  
>  # BEGIN implemented abstract methods from PVE::AbstractConfig
>  
> @@ -908,6 +909,71 @@ for (my $i = 0; $i < $MAX_UNUSED_DISKS; $i++) {
>      }
>  }
>  
> +PVE::JSONSchema::register_format('pve-lxc-dev-string', \&verify_lxc_dev_string);
> +sub verify_lxc_dev_string {
> +    my ($dev, $noerr) = @_;
> +
> +    if (
> +	$dev =~ m@/\.\.?/@ ||
> +	$dev =~ m@/\.\.?$@ ||

could be a single regex:

$dev =~ @/\.\.?(?:/|$)@

but no hard feelings, all variant are not easily readable and need close
checking anyway (iow. like most regexes)

> +	$dev !~ m!^/dev/!
> +    ) {
> +	return undef if $noerr;
> +	die "$dev is not a valid device path\n";
> +    }
> +
> +    return $dev;
> +}
> +
> +PVE::JSONSchema::register_format('file-access-mode-string', \&verify_file_access_mode);
> +sub verify_file_access_mode {
> +    my ($mode, $noerr) = @_;
> +
> +    if ($mode !~ /^[0-7]*$/) {

this would allow an empty mode though? Also, not sure if we want to allow
partial modes like 77 ?

> +	return undef if $noerr;
> +	die "$mode is not a valid file access mode\n";
> +    }
> +
> +    return $mode;
> +}
> +
> +my $dev_desc = {
> +    path => {
> +	optional => 1,
> +	type => 'string',
> +	default_key => 1,
> +	format => 'pve-lxc-dev-string',
> +	format_description => 'Path',
> +	description => 'Device to pass through to the container',
> +	verbose_description => 'Path to the device to pass through to the container',
> +    },
> +    mode => {
> +	optional => 1,
> +	type => 'integer',
> +	format => 'file-access-mode-string',
> +	format_description => 'Octal access mode',
> +	description => 'Access mode to be set on the device node',
> +    },
> +    uid => {
> +	optional => 1,
> +	type => 'integer',
> +	description => 'User ID to be assigned to the device node',
> +    },
> +    gid => {
> +	optional => 1,
> +	type => 'integer',
> +	description => 'Group ID to be assigned to the device node',
> +    },
> +};
> +
> +for (my $i = 0; $i < $MAX_DEVICES; $i++) {
> +    $confdesc->{"dev$i"} = {
> +	optional => 1,
> +	type => 'string', format => $dev_desc,
> +	description => "Device to pass through to the container",
> +    }
> +}
> +
>  sub parse_pct_config {
>      my ($filename, $raw, $strict) = @_;
>  
> @@ -1255,6 +1321,21 @@ sub parse_volume {
>      return;
>  }
>  
> +sub parse_device {
> +    my ($class, $device_string, $noerr) = @_;
> +
> +    my $res;
> +    eval { $res = PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };

you can write above as:

my $res = eval { PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };

which makes it slightly nicer

> +    if ($@) {
> +	return undef if $noerr;
> +	die $@;
> +    }
> +
> +    die "Path has to be defined" unless (defined($res->{path}));

We do not use `unless`, it's basically one of the few things our code universally
(well almost, nobody is perfect..) agrees too, so lets keep it that way and use

... if !defined(...);

also, shouldn't a set $noerr also disarm this die? I.e.:

die "..." if !$noerr && !defined(...);

> +
> +    return $res;
> +}
> +
>  sub print_volume {
>      my ($class, $key, $volume) = @_;
>  
> @@ -1762,4 +1843,16 @@ sub get_derived_property {
>      }
>  }
>  
> +sub foreach_passthrough_device {
> +    my ($class, $conf, $func, @param) = @_;
> +
> +    foreach my $key (keys %$conf) {

prefer for over foreach for new code (no semantic reason, just for consistency)

> +	next if $key !~ m/^dev(\d+)$/;
> +
> +	my $device = $class->parse_device($conf->{$key});
> +
> +	$func->($key, $device, @param);
> +    }
> +}
> +
>  1;
> diff --git a/src/PVE/LXC/Tools.pm b/src/PVE/LXC/Tools.pm
> index 62cdbc1..c9fb834 100644
> --- a/src/PVE/LXC/Tools.pm
> +++ b/src/PVE/LXC/Tools.pm
> @@ -81,10 +81,9 @@ sub lxc_hook($$&) {
>      $code->($ct_name, $common_vars, $namespaces, $args);
>  }
>  
> -sub for_current_devices($&) {
> -    my ($vmid, $code) = @_;
> +sub for_devices {
> +    my ($devlist_file, $vmid, $code) = @_;
>  
> -    my $devlist_file = "/var/lib/lxc/$vmid/devices";
>      my $fd;
>  
>      if (! open $fd, '<', $devlist_file) {
> @@ -93,8 +92,8 @@ sub for_current_devices($&) {
>      }
>  
>      while (defined(my $line = <$fd>)) {
> -	if ($line !~ m@^(b):(\d+):(\d+):/dev/(\S+)\s*$@) {
> -	    warn "invalid .pve-devices entry: $line\n";
> +	if ($line !~ m@^(b|c):(\d+):(\d+):/dev/(\S+)\s*$@) {
> +	    warn "invalid $devlist_file entry: $line\n";
>  	    next;
>  	}
>  
> @@ -117,6 +116,20 @@ sub for_current_devices($&) {
>      close $fd;
>  }
>  
> +sub for_current_devices($&) {
> +    my ($vmid, $code) = @_;
> +
> +    my $devlist_file = "/var/lib/lxc/$vmid/devices";
> +    for_devices($devlist_file, $vmid, $code);
> +}
> +
> +sub for_passthrough_devices($&) {
> +    my ($vmid, $code) = @_;
> +
> +    my $passthrough_devlist_file = "/var/lib/lxc/$vmid/passthrough_devices";
> +    for_devices($passthrough_devlist_file, $vmid, $code);
> +}
> +
>  sub cgroup_do_write($$) {
>      my ($path, $value) = @_;
>      my $fd;
> diff --git a/src/lxc-pve-autodev-hook b/src/lxc-pve-autodev-hook
> index 3c45949..105465f 100755
> --- a/src/lxc-pve-autodev-hook
> +++ b/src/lxc-pve-autodev-hook
> @@ -3,17 +3,31 @@
>  use strict;
>  use warnings;
>  
> -use File::Path;
> +use Fcntl qw(S_IFREG);
>  use File::Basename;
> +use File::Path;
>  
>  use PVE::LXC::Tools;
> -use PVE::Tools;
> +use PVE::Tools qw(MS_BIND);
>  
>  PVE::LXC::Tools::lxc_hook('autodev', 'lxc', sub {
>      my ($vmid, $vars, undef, undef) = @_;
>  
>      my $root = $vars->{ROOTFS_MOUNT};
>  
> +    PVE::LXC::Tools::for_passthrough_devices($vmid, sub {
> +	my ($type, $major, $minor, $dev) = @_;
> +
> +	my $rel_devpath = "/dev/$dev";
> +	my $rel_dir = dirname($rel_devpath);
> +	File::Path::mkpath("$root/$rel_dir");
> +	PVE::Tools::mknod("$root/dev/$dev", S_IFREG, 0)
> +	    or die("Could not mknod $root/dev/$dev");

Please include $! in the error

> +
> +	PVE::Tools::mount("/var/lib/lxc/$vmid/passthrough/dev/$dev", "$root/dev/$dev", 0, MS_BIND, 0)
> +	    or die("Bind mount of device $dev into container failed\n");

same here, add $! in error

> +    });
> +
>      PVE::LXC::Tools::for_current_devices($vmid, sub {
>  	my ($type, $major, $minor, $dev) = @_;
>  
> diff --git a/src/lxc-pve-prestart-hook b/src/lxc-pve-prestart-hook
> index 936d0bf..d8b0106 100755
> --- a/src/lxc-pve-prestart-hook
> +++ b/src/lxc-pve-prestart-hook
> @@ -6,6 +6,7 @@ use strict;
>  use warnings;
>  
>  use Fcntl qw(O_DIRECTORY :mode);
> +use File::Basename;
>  use File::Path;
>  use POSIX;
>  
> @@ -62,7 +63,7 @@ PVE::LXC::Tools::lxc_hook('pre-start', 'lxc', sub {
>      unlink $devlist_file;
>      my $devices = [];
>  
> -    my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
> +    my ($id_map, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
>  
>      # Unmount first when the user mounted the container with "pct mount".
>      eval {
> @@ -118,6 +119,51 @@ PVE::LXC::Tools::lxc_hook('pre-start', 'lxc', sub {
>  
>      PVE::LXC::Config->foreach_volume($conf, $setup_mountpoint);
>  
> +    # Device passthrough
> +    my $passthrough_devlist_file = "/var/lib/lxc/$vmid/passthrough_devices";

would favor kebab-case for filenames too

> +    unlink $passthrough_devlist_file;

Would be good to have error handling there:

unlink or $!{ENOENT} or die "failed to clean-up passthrough device list file - $!\n"



> +    my $passthrough_devices = [];
> +
> +    my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";

no hard feelings, but what about using a common directory for the device
list file and the mountpoint:

/var/lib/lxc/$vmid/passthrough/devices
/var/lib/lxc/$vmid/passthrough/mount

But I did not really check the overall picture, so this can be fine as is
(well, with snake_case -> kebab-case)

> +    File::Path::make_path($passthrough_dir);
> +    PVE::Tools::mount("none", $passthrough_dir, "tmpfs", 0, "size=8k")
> +        or die ("Could not mount tmpfs for device passthrough at $passthrough_dir");

please include $! in the error message so that we have a better idea what
may be going on if a user reports running into this error.

> +
> +    my $setup_passthrough_device = sub {
> +	my ($key, $device) = @_;
> +
> +	my $absolute_path = $device->{path};
> +	my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> +
> +	die "Could not get mode or device ID of $absolute_path\n"
> +	    if (!defined($mode) || !defined($rdev));
> +
> +	my $passthrough_device_path = $passthrough_dir . $absolute_path;
> +	File::Path::make_path(dirname($passthrough_device_path));
> +	PVE::Tools::mknod($passthrough_device_path, $mode, $rdev)
> +	    or die("failed to mknod $passthrough_device_path\n");

please include $! in the error

> +
> +	# Use chmod because umask could mess with the access mode on mknod
> +	my $passthrough_mode = 0660;
> +	$passthrough_mode = oct($device->{mode}) if (defined($device->{mode}));
> +	chmod $passthrough_mode, $passthrough_device_path
> +	    or die "failed to chmod $passthrough_mode $passthrough_device_path: $!\n";
> +
> +	# Set uid and gid of the device node
> +	my $uid = 0;
> +	my $gid = 0;
> +	$uid = $device->{uid} if (defined($device->{uid}));
> +	$gid = $device->{gid} if (defined($device->{gid}));

you can drop the outer parenthesis in the post-if's expression for both of above lines
for consistency.

> +	$uid = PVE::LXC::map_ct_uid_to_host($uid, $id_map);
> +	$gid = PVE::LXC::map_ct_gid_to_host($gid, $id_map);
> +	chown $uid, $gid, $passthrough_device_path
> +	    or die("failed to chown $uid:$gid $passthrough_device_path: $!\n");
> +
> +	push @$passthrough_devices, [$absolute_path, $mode, $rdev];
> +    };
> +
> +    PVE::LXC::Config->foreach_passthrough_device($conf, $setup_passthrough_device);
> +
>      my $lxc_setup = PVE::LXC::Setup->new($conf, $rootdir);
>      $lxc_setup->pre_start_hook();
>  
> @@ -140,6 +186,18 @@ PVE::LXC::Tools::lxc_hook('pre-start', 'lxc', sub {
>  	}
>  	PVE::Tools::file_set_contents($devlist_file, $devlist);
>      }
> +
> +    if (@$passthrough_devices) {
> +	my $devlist = '';
> +	foreach my $dev (@$passthrough_devices) {

s/foreach/for/

> +	    my ($path, $mode, $rdev) = @$dev;
> +	    my $major = PVE::Tools::dev_t_major($rdev);
> +	    my $minor = PVE::Tools::dev_t_minor($rdev);
> +	    my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> +	    $devlist .= "$device_type_char:$major:$minor:$path\n";
> +	}
> +	PVE::Tools::file_set_contents($passthrough_devlist_file, $devlist);
> +    }
>  });
>  
>  # Leftover cgroups prevent lxc from starting without any useful information






More information about the pve-devel mailing list