[pve-devel] [RFC common v3 10/34] env: add module with helpers to run a Perl subroutine in a user namespace

Tue Nov 12 15:20:38 CET 2024

On November 7, 2024 5:51 pm, Fiona Ebner wrote:
> The first use case is running the container backup subroutine for
> external providers inside a user namespace. That allows them to see
> the filesystem to back-up from the containers perspective and also
> improves security because of isolation.
> 
> Copied and adapted the relevant parts from the pve-buildpkg
> repository.
> 
> Originally-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
> [FE: add $idmap parameter, drop $aux_groups parameter]
> Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
> ---
> 
> New in v3.
> 
>  src/Makefile   |   1 +
>  src/PVE/Env.pm | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 137 insertions(+)
>  create mode 100644 src/PVE/Env.pm
> 
> diff --git a/src/Makefile b/src/Makefile
> index 2d8bdc4..dba26e3 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -15,6 +15,7 @@ LIB_SOURCES = \
>  	Certificate.pm \
>  	CpuSet.pm \
>  	Daemon.pm \
> +	Env.pm \
>  	Exception.pm \
>  	Format.pm \
>  	INotify.pm \
> diff --git a/src/PVE/Env.pm b/src/PVE/Env.pm
> new file mode 100644
> index 0000000..e11bec0
> --- /dev/null
> +++ b/src/PVE/Env.pm
> @@ -0,0 +1,136 @@
> +package PVE::Env;

I agree with Thomas that this name might be a bit too generic ;)

I also wonder: since this seems to be used only in pve-container, and
it mostly makes sense in that context, wouldn't it be better off
there? Or do we expect other areas where we need userns handling?
(Granted, some of the comments below would require other changes to
pve-common anyway ;))

> +
> +use strict;
> +use warnings;
> +
> +use Fcntl qw(O_WRONLY);
> +use POSIX qw(EINTR);
> +use Socket;
> +
> +require qw(syscall.ph);

PVE::Syscall already does this, and has the following:

BEGIN {
    die "syscall.ph can only be required once!\n" if $INC{'syscall.ph'};
    require("syscall.ph");

Don't those two clash? I think these syscall-related parts should
probably move to PVE::Syscall.

> +
> +use constant {CLONE_NEWNS   => 0x00020000,
> +              CLONE_NEWUSER => 0x10000000};
> +
> +sub unshare($) {
> +    my ($flags) = @_;
> +    return 0 == syscall(272, $flags);
> +}

This duplicates PVE::Tools::unshare; maybe the latter should move here?

> +
> +sub __set_id_map($$$) {
> +    my ($pid, $what, $value) = @_;
> +    sysopen(my $fd, "/proc/$pid/${what}_map", O_WRONLY)
> +	or die "failed to open child process' ${what}_map\n";
> +    my $rc = syswrite($fd, $value);
> +    if (!$rc || $rc != length($value)) {
> +	die "failed to set sub$what: $!\n";
> +    }
> +    close($fd);
> +}
> +
> +sub set_id_map($$) {
> +    my ($pid, $id_map) = @_;
> +
> +    my $gid_map = '';
> +    my $uid_map = '';
> +
> +    for my $map ($id_map->@*) {
> +	my ($type, $ct, $host, $length) = $map->@*;
> +
> +	$gid_map .= "$ct $host $length\n" if $type eq 'g';
> +	$uid_map .= "$ct $host $length\n" if $type eq 'u';
> +    }
> +
> +    __set_id_map($pid, 'gid', $gid_map) if $gid_map;
> +    __set_id_map($pid, 'uid', $uid_map) if $uid_map;
> +}

do we gain a lot here from not just using newuidmap/newgidmap?

> +
> +sub wait_for_child($;$) {
> +    my ($pid, $noerr) = @_;
> +    my $interrupts = 0;
> +    while (waitpid($pid, 0) != $pid) {
> +	if ($! == EINTR) {
> +	    warn "interrupted...\n";
> +	    kill(($interrupts > 3 ? 9 : 15), $pid);
> +	    $interrupts++;
> +	}
> +    }
> +    my $status = POSIX::WEXITSTATUS($?);
> +    return $status if $noerr;
> +
> +    if ($? == -1) {
> +	die "failed to execute\n";
> +    } elsif (POSIX::WIFSIGNALED($?)) {
> +	my $sig = POSIX::WTERMSIG($?);
> +	die "got signal $sig\n";
> +    } elsif ($status != 0) {
> +	warn "exit code $status\n";
> +    }
> +    return $status;
> +}
> +
> +sub forked(&%) {

This seems very similar to the already existing PVE::Tools::run_fork /
run_fork_with_timeout helpers. Is there any reason we can't extend those
with `afterfork` support and use them?

> +    my ($code, %opts) = @_;
> +
> +    pipe(my $except_r, my $except_w) or die "pipe: $!\n";
> +
> +    my $pid = fork();
> +    die "fork failed: $!\n" if !defined($pid);
> +
> +    if ($pid == 0) {
> +	close($except_r);
> +	eval { $code->() };
> +	if ($@) {
> +	    print {$except_w} $@;
> +	    $except_w->flush();
> +	    POSIX::_exit(1);
> +	}
> +	POSIX::_exit(0);
> +    }
> +    close($except_w);
> +
> +    my $err;
> +    if (my $afterfork = $opts{afterfork}) {
> +	eval { $afterfork->($pid); };
> +	if ($err = $@) {
> +	    kill(15, $pid);
> +	    $opts{noerr} = 1;
> +	}
> +    }
> +    if (!$err) {
> +	$err = do { local $/ = undef; <$except_r> };
> +    }
> +    my $rv = wait_for_child($pid, $opts{noerr});
> +    die $err if $err;
> +    die "an unknown error occurred\n" if $rv != 0;
> +    return $rv;
> +}
> +
> +sub run_in_userns(&;$) {
> +    my ($code, $id_map) = @_;
> +    socketpair(my $sp, my $sc, AF_UNIX, SOCK_STREAM, PF_UNSPEC)
> +	or die "socketpair: $!\n";
> +    forked(sub {
> +	close($sp);
> +	unshare(CLONE_NEWUSER|CLONE_NEWNS) or die "unshare(NEWUSER|NEWNS): $!\n";

I guess we can't set our "own" maps here (for lack of capabilities) and
thereby avoid the whole afterfork thing entirely? At least I couldn't
get it to work ;)

> +	syswrite($sc, "1\n") == 2 or die "write: $!\n";
> +	shutdown($sc, 1);
> +	my $two = <$sc>;
> +	die "failed to sync with parent process\n" if $two ne "2\n";
> +	close($sc);
> +	$! = undef;
> +	($(, $)) = (0, 0); die "$!\n" if $!;
> +	($<, $>) = (0, 0); die "$!\n" if $!;
> +	$code->();
> +    }, afterfork => sub {
> +	my ($pid) = @_;
> +	close($sc);
> +	my $one = <$sp>;
> +	die "failed to sync with userprocess\n" if $one ne "1\n";
> +	set_id_map($pid, $id_map);
> +	syswrite($sp, "2\n") == 2 or die "write: $!\n";
> +	close($sp);
> +    });
> +}
> +
> +1;
> -- 
> 2.39.5
> 
> 
> 
> _______________________________________________
> pve-devel mailing list
> pve-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
> 
> 
>