[pve-devel] [PATCH container] bindmount: catch rw/ro race and add tests
Wolfgang Bumiller
w.bumiller at proxmox.com
Thu Jun 2 16:23:09 CEST 2016
rw/ro race occurs when a container contains the same bind
mount twice and another container contains a bind mount
containing the first container's destination. If the double
bind mounts are both meant to be read-only then the second
container could theoretically swap them out between the
mount and the read-only remount call, then swap them back
for the test. So to verify this we use the same file
descriptor we use for the dev/inode check and perform an
faccessat() call and expect it to return EROFS and nothing
else.
Also include O_NOFOLLOW in the checks' openat() calls.
---
src/PVE/LXC.pm | 90 ++++++++++++++++++++++++++++++++++------------
src/test/Makefile | 5 ++-
src/test/bindmount_test.pl | 89 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 160 insertions(+), 24 deletions(-)
create mode 100755 src/test/bindmount_test.pl
diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 7c5a5e6..3a7d2a2 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -11,7 +11,7 @@ use File::Path;
use File::Spec;
use Cwd qw();
use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY);
-use Errno qw(ELOOP);
+use Errno qw(ELOOP EROFS);
use PVE::Exception qw(raise_perm_exc);
use PVE::Storage;
@@ -1044,49 +1044,95 @@ sub walk_tree_nofollow($$$) {
return $fd;
}
-sub bindmount {
- my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_;
-
- my $srcdh = walk_tree_nofollow('/', $dir, 0);
-
- PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]);
- if ($ro) {
- eval { PVE::Tools::run_command(['mount', '-o', 'bind,remount,ro', $dest]); };
- if (my $err = $@) {
- warn "bindmount error\n";
- # don't leave writable bind-mounts behind...
- PVE::Tools::run_command(['umount', $dest]);
- die $err;
- }
- }
+# To guard against symlink attack races against other currently running
+# containers with shared recursive bind mount hierarchies we prepare a
+# directory handle for the directory we're mounting over to verify the
+# mountpoint afterwards.
+sub __bindmount_prepare {
+ my ($hostroot, $dir) = @_;
+ my $srcdh = walk_tree_nofollow($hostroot, $dir, 0);
+ return $srcdh;
+}
+# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b'
+# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory
+# we intended to bind mount.
+sub __bindmount_verify {
+ my ($srcdh, $parentfd, $last_dir, $ro) = @_;
my $destdh;
if ($parentfd) {
# Open the mount point path coming from the parent directory since the
# filehandle we would have gotten as first result of walk_tree_nofollow
# earlier is still a handle to the underlying directory instead of the
# mounted path.
- $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH)
+ $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY);
+ die "failed to open mount point: $!\n" if !$destdh;
+ if ($ro) {
+ my $dot = '.';
+ # 269: faccessat()
+ # no separate function because 99% of the time it's the wrong thing to use.
+ if (syscall(269, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) {
+ die "failed to mark bind mount read only\n";
+ }
+ die "read-only check failed: $!\n" if $! != EROFS;
+ }
} else {
# For the rootfs we don't have a parentfd so we open the path directly.
# Note that this means bindmounting any prefix of the host's
# /var/lib/lxc/$vmid path into another container is considered a grave
# security error.
sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY;
+ die "failed to open mount point: $!\n" if !$destdh;
}
- die "failed to open directory $dir: $!\n" if !$destdh;
my ($srcdev, $srcinode) = stat($srcdh);
my ($dstdev, $dstinode) = stat($destdh);
close $srcdh;
close $destdh;
- if ($srcdev != $dstdev || $srcinode != $dstinode) {
+ return ($srcdev == $dstdev && $srcinode == $dstinode);
+}
+
+# Perform the actual bind mounting:
+sub __bindmount_do {
+ my ($dir, $dest, $ro, @extra_opts) = @_;
+ PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]);
+ if ($ro) {
+ eval { PVE::Tools::run_command(['mount', '-o', 'bind,remount,ro', $dest]); };
+ if (my $err = $@) {
+ warn "bindmount error\n";
+ # don't leave writable bind-mounts behind...
+ PVE::Tools::run_command(['umount', $dest]);
+ die $err;
+ }
+ }
+}
+
+sub bindmount {
+ my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_;
+
+ my $srcdh = __bindmount_prepare('/', $dir);
+
+ __bindmount_do($dir, $dest, $ro, @extra_opts);
+
+ if (!__bindmount_verify($srcdh, $parentfd, $last_dir, $ro)) {
PVE::Tools::run_command(['umount', $dest]);
die "detected mount path change at: $dir\n";
}
}
+# Cleanup $rootdir a bit (double and trailing slashes), build the mount path
+# from $rootdir and $mount and walk the path from $rootdir to the final
+# directory to check for symlinks.
+sub __mount_prepare_rootdir {
+ my ($rootdir, $mount) = @_;
+ $rootdir =~ s!/+!/!g;
+ $rootdir =~ s!/+$!!;
+ my $mount_path = "$rootdir/$mount";
+ my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1);
+ return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir);
+}
+
# use $rootdir = undef to just return the corresponding mount path
sub mountpoint_mount {
my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_;
@@ -1105,10 +1151,8 @@ sub mountpoint_mount {
my ($mpfd, $parentfd, $last_dir);
if (defined($rootdir)) {
- $rootdir =~ s!/+!/!g;
- $rootdir =~ s!/+$!!;
- $mount_path = "$rootdir/$mount";
- ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1);
+ ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) =
+ __mount_prepare_rootdir($rootdir, $mount);
}
my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
diff --git a/src/test/Makefile b/src/test/Makefile
index f3e260d..497a179 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -1,7 +1,7 @@
all:
-test: test_setup test_snapshot
+test: test_setup test_snapshot test_bindmount
test_setup: run_setup_tests.pl
./run_setup_tests.pl
@@ -9,5 +9,8 @@ test_setup: run_setup_tests.pl
test_snapshot: run_snapshot_tests.pl
./run_snapshot_tests.pl
+test_bindmount: bindmount_test.pl
+ ./bindmount_test.pl
+
clean:
rm -rf tmprootfs
diff --git a/src/test/bindmount_test.pl b/src/test/bindmount_test.pl
new file mode 100755
index 0000000..a6052db
--- /dev/null
+++ b/src/test/bindmount_test.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use Cwd;
+use File::Path;
+
+use lib qw(..);
+
+use PVE::LXC;
+
+my $pwd = getcwd();
+
+my $rootdir = "./tmproot";
+my $destdir = "/mnt";
+
+my $sharedir = "/tmpshare";
+my $a = "/tmpshare/a";
+my $ab = "/tmpshare/a/b";
+my $abc = "/tmpshare/a/b/c";
+my $sym = "/tmpshare/sym";
+
+my $secret = "/secret";
+
+END {
+ my $ignore_error;
+ File::Path::rmtree("$pwd/$rootdir", {error => \$ignore_error});
+ File::Path::rmtree("$pwd/$sharedir", {error => \$ignore_error});
+ File::Path::rmtree("$pwd/$secret", {error => \$ignore_error});
+};
+
+# Create all the test paths...
+PVE::LXC::walk_tree_nofollow('.', $rootdir, 1);
+PVE::LXC::walk_tree_nofollow($rootdir, $destdir, 1);
+PVE::LXC::walk_tree_nofollow('.', $abc, 1);
+PVE::LXC::walk_tree_nofollow('.', $secret, 1);
+# Create one evil symlink
+symlink('a/b', "$pwd/$sym") or die "failed to prepare test folders: $!\n";
+
+# Test walk_tree_nofollow:
+eval { PVE::LXC::walk_tree_nofollow('.', $rootdir, 0) };
+die "unexpected error: $@" if $@;
+eval { PVE::LXC::walk_tree_nofollow('.', "$sym/c", 0) };
+die "failed to catch symlink at $sym/c\n" if !$@;
+die "unexpected test error: '$@'\n" if $@ ne "symlink encountered at: .$sym\n";
+
+# Bindmount testing:
+sub bindmount {
+ my ($from, $rootdir, $destdir, $inject, $ro, $inject_write) = @_;
+
+ my ($mpath, $mpfd, $parentfd, $last);
+ ($rootdir, $mpath, $mpfd, $parentfd, $last) =
+ PVE::LXC::__mount_prepare_rootdir($rootdir, $destdir);
+
+ my $srcdh = PVE::LXC::__bindmount_prepare('.', $from);
+
+ if ($inject) {
+ File::Path::rmtree(".$inject");
+ symlink("$pwd/$secret", ".$inject")
+ or die "failed to create symlink\n";
+ PVE::LXC::walk_tree_nofollow('.', $from, 1);
+ }
+ PVE::LXC::__bindmount_do(".$from", $mpath, $inject_write ? 0 : $ro);
+
+ my $res = PVE::LXC::__bindmount_verify($srcdh, $parentfd, $last, $ro);
+
+ system('umount', $mpath);
+}
+
+bindmount($a, $rootdir, $destdir, undef, 0, 0);
+eval { bindmount($sym, $rootdir, $destdir, undef, 0, 0) };
+die "illegal symlink bindmount went through\n" if !$@;
+bindmount($abc, $rootdir, $destdir, undef, 0, 0);
+# Race test: Assume someone exchanged 2 equivalent bind mounts between and
+# after the bindmount_do()'s mount and remount calls.
+# First: non-ro mount, should pass
+bindmount($abc, $rootdir, $destdir, undef, 0, 1);
+# Second: regular read-only mount, should pass.
+bindmount($abc, $rootdir, $destdir, undef, 1, 0);
+# Third: read-only requested with read-write injected
+eval { bindmount($abc, $rootdir, $destdir, undef, 1, 1) };
+die "read-write mount possible\n" if !$@;
+die "unexpected test error: $@\n" if $@ ne "failed to mark bind mount read only\n";
+# Race test: Replace /tmpshare/a/b with a symlink to /secret just before
+# __bindmount_do().
+eval { bindmount($abc, $rootdir, $destdir, $ab, 0, 0) };
+die "injected symlink bindmount went through\n" if !$@;
+die "unexpected test error: $@\n" if $@ ne "symlink encountered at: .$ab\n";
--
2.1.4
More information about the pve-devel
mailing list