[pve-devel] [PATCH container 1/4] add mknod feature flag
Wolfgang Bumiller
w.bumiller at proxmox.com
Thu Jan 30 09:27:30 CET 2020
This causes mknod() and mknodat() calls for character and block
devices to be forwarded to the seccomp proxy, so unprivileged
containers can finally create /dev/null by themselves.
For now this is experimental and therefore added to
`features`. Ideally, if this works as intended, we can make
it the default in pve 7.
Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
src/Makefile | 1 -
src/PVE/LXC.pm | 88 +++++++++++++++++++++++++++++++++++--------
src/PVE/LXC/Config.pm | 8 ++++
3 files changed, 80 insertions(+), 17 deletions(-)
diff --git a/src/Makefile b/src/Makefile
index 5e32d38..7166708 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -73,7 +73,6 @@ pve-userns.seccomp: /usr/share/lxc/config/common.seccomp
cp $< $@
echo 'keyctl errno 38' >> $@
-
.PHONY: test
test:
make -C test test
diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 34949c6..c590fc8 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -19,7 +19,10 @@ use PVE::Storage;
use PVE::SafeSyslog;
use PVE::INotify;
use PVE::JSONSchema qw(get_standard_option);
-use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH AT_FDCWD);
+use PVE::Tools qw(
+ dir_glob_foreach file_get_contents file_set_contents lock_file
+ lock_file_full AT_FDCWD O_PATH $IPV4RE $IPV6RE
+);
use PVE::CpuSet;
use PVE::Network;
use PVE::AccessControl;
@@ -461,21 +464,22 @@ sub get_cgroup_subsystems {
return wantarray ? ($v1, $v2) : $v1;
}
-# Currently we do not need to create seccomp profile 'files' as the only
-# choice our configuration actually allows is "with or without keyctl()",
-# so we distinguish between using lxc's "default" seccomp profile and our
-# added pve-userns.seccomp file.
+# With seccomp trap to userspace we now have the ability to optionally forward
+# certain syscalls to the "host" to handle (via our pve-lxc-syscalld daemon).
#
-# This returns a configuration line added to the raw lxc config.
+# This means that there are cases where we need to create an extra seccomp
+# profile for the container to load.
+#
+# This returns a configuration snippet added to the raw lxc config.
sub make_seccomp_config {
- my ($conf, $unprivileged, $features) = @_;
+ my ($conf, $conf_dir, $unprivileged, $features) = @_;
# User-configured profile has precedence, note that the user's entry would
# be written 'after' this line anyway...
if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) {
# Warn the user if this conflicts with a feature:
- if ($features->{keyctl}) {
- warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n";
- }
+ my $warn = join(', ', grep { $features->{$_} } qw(keyctl mknod));
+ warn "explicitly configured lxc.seccomp.profile overrides the following settings: $warn\n"
+ if length($warn) > 0;
return '';
}
@@ -485,14 +489,66 @@ sub make_seccomp_config {
# well)
return '' if !$unprivileged;
+ my $rules = {
+ keyctl => ['errno 38'],
+ };
+
+ my $raw_conf = '';
+
# Unprivileged containers will get keyctl() disabled by default as a
# workaround for systemd-networkd behavior. But we have an option to
# explicitly enable it:
- return '' if $features->{keyctl};
-
- # Finally we're in an unprivileged container without `keyctl` set
- # explicitly. We have a file prepared for this:
- return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n";
+ if ($features->{keyctl}) {
+ delete $rules->{keyctl};
+ }
+
+ # By default, unprivileged containers cannot use `mknod` at all.
+ # Since lxc 3.2, we can use seccomp's trap to userspace feature for this,
+ # but for now this is experimental, so it has to be enabled via a feature
+ # flag.
+ # Note that we only handle block and char devices (like lxd), the rest we
+ # leave up to the kernel. We may in the future remove this if seccomp gets
+ # a way to tell the kernel to "continue" a syscall.
+ if ($features->{mknod}) {
+ $raw_conf .= "lxc.seccomp.notify.proxy = unix:/run/pve/lxc-syscalld.sock\n";
+
+ $rules->{mknod} = [
+ # condition: (mode & S_IFMT) == S_IFCHR
+ 'notify [1,8192,SCMP_CMP_MASKED_EQ,61440]',
+ # condition: (mode & S_IFMT) == S_IFBLK
+ 'notify [1,24576,SCMP_CMP_MASKED_EQ,61440]',
+ ];
+ $rules->{mknodat} = [
+ # condition: (mode & S_IFMT) == S_IFCHR
+ 'notify [2,8192,SCMP_CMP_MASKED_EQ,61440]',
+ # condition: (mode & S_IFMT) == S_IFBLK
+ 'notify [2,24576,SCMP_CMP_MASKED_EQ,61440]',
+ ];
+ }
+
+ # Now build the custom seccomp rule text...
+ my $extra_rules = join("\n", map {
+ my $syscall = $_;
+ map { "$syscall $_" } $rules->{$syscall}->@*
+ } sort keys %$rules) . "\n";
+
+ return $raw_conf if $extra_rules eq "\n";
+
+ # We still have the "most common" config readily available, so don't write
+ # out that one:
+ if ($raw_conf eq '' && $extra_rules eq "keyctl errno 38\n") {
+ # we have no extra $raw_conf and use the same we had in pve 6.1:
+ return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n";
+ }
+
+ # Write the rule file to the container's config path:
+ my $rule_file = "$conf_dir/rules.seccomp";
+ my $rule_data = file_get_contents("$LXC_CONFIG_PATH/common.seccomp")
+ . $extra_rules;
+ file_set_contents($rule_file, $rule_data);
+ $raw_conf .= "lxc.seccomp.profile = $rule_file\n";
+
+ return $raw_conf;
}
# Since lxc-3.0.2 we can have lxc generate a profile for the container
@@ -588,7 +644,7 @@ sub update_lxc_config {
my $features = PVE::LXC::Config->parse_features($conf->{features});
- $raw .= make_seccomp_config($conf, $unprivileged, $features);
+ $raw .= make_seccomp_config($conf, $dir, $unprivileged, $features);
$raw .= make_apparmor_config($conf, $unprivileged, $features);
if ($features->{fuse}) {
$raw .= "lxc.apparmor.raw = mount fstype=fuse,\n";
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index eec6b38..310aba6 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -323,6 +323,14 @@ my $features_desc = {
description => "Allow using 'fuse' file systems in a container."
." Note that interactions between fuse and the freezer cgroup can potentially cause I/O deadlocks.",
},
+ mknod => {
+ optional => 1,
+ type => 'boolean',
+ default => 0,
+ description => "Allow unprivileged containers to use mknod() to add certain device nodes."
+ ." This requires a kernel with seccomp trap to user space support (5.3 or newer)."
+ ." This is experimental.",
+ },
};
my $confdesc = {
--
2.20.1
More information about the pve-devel mailing list