[pve-devel] [PATCH zfsonlinux 4/6] Backport deadlock fix for issue ZOL#7939

Stoiko Ivanov s.ivanov at proxmox.com
Tue Oct 30 11:14:27 CET 2018


A deadlock issue got fixed upstream and merged into master [0].
This patch backports the fix by splitting it into the parts for SPL and ZFS.

[0] https://github.com/zfsonlinux/zfs/pull/7939

Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
 ...tween-mm_sem-and-tx-assign-in-zfs_write-a.patch |  42 +++
 spl-patches/series                                 |   1 +
 ...tween-mm_sem-and-tx-assign-in-zfs_write-a.patch | 376 +++++++++++++++++++++
 zfs-patches/series                                 |   1 +
 4 files changed, 420 insertions(+)
 create mode 100644 spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
 create mode 100644 zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch

diff --git a/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644
index 0000000..2906ef8
--- /dev/null
+++ b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
@@ -0,0 +1,42 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut at gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+   and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+   thread #2, so it stuck and can't complete,  then txg "n" will
+   not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko at gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens at delphix.com>
+Signed-off-by: Grady Wong <grady.w at xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ include/sys/uio.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/sys/uio.h b/include/sys/uio.h
+index 764beb9..2895690 100644
+--- a/include/sys/uio.h
++++ b/include/sys/uio.h
+@@ -53,6 +53,7 @@ typedef struct uio {
+ 	int		uio_iovcnt;
+ 	offset_t	uio_loffset;
+ 	uio_seg_t	uio_segflg;
++	boolean_t	uio_fault_disable;
+ 	uint16_t	uio_fmode;
+ 	uint16_t	uio_extflg;
+ 	offset_t	uio_limit;
diff --git a/spl-patches/series b/spl-patches/series
index 20724b7..5bcfa6a 100644
--- a/spl-patches/series
+++ b/spl-patches/series
@@ -1 +1,2 @@
 0001-remove-DKMS-and-module-build.patch
+0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
diff --git a/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644
index 0000000..6872d8b
--- /dev/null
+++ b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
@@ -0,0 +1,376 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut at gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+   and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+   thread #2, so it stuck and can't complete,  then txg "n" will
+   not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko at gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens at delphix.com>
+Signed-off-by: Grady Wong <grady.w at xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ include/sys/uio_impl.h                             |   2 +-
+ module/zcommon/zfs_uio.c                           |  31 ++++-
+ module/zfs/zfs_vnops.c                             |  24 +++-
+ tests/zfs-tests/cmd/mmapwrite/mmapwrite.c          | 140 +++++++++++++++------
+ .../tests/functional/mmap/mmap_write_001_pos.ksh   |   8 +-
+ 5 files changed, 151 insertions(+), 54 deletions(-)
+
+diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
+index 37e283da..cfef0b95 100644
+--- a/include/sys/uio_impl.h
++++ b/include/sys/uio_impl.h
+@@ -42,7 +42,7 @@
+ #include <sys/uio.h>
+ 
+ extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
+-extern void uio_prefaultpages(ssize_t, uio_t *);
++extern int uio_prefaultpages(ssize_t, uio_t *);
+ extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
+ extern void uioskip(uio_t *, size_t);
+ 
+diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
+index 7b4175bb..8e969bbc 100644
+--- a/module/zcommon/zfs_uio.c
++++ b/module/zcommon/zfs_uio.c
+@@ -50,6 +50,7 @@
+ #include <sys/types.h>
+ #include <sys/uio_impl.h>
+ #include <linux/kmap_compat.h>
++#include <linux/uaccess.h>
+ 
+ /*
+  * Move "n" bytes at byte address "p"; "rw" indicates the direction
+@@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+ 				if (copy_to_user(iov->iov_base+skip, p, cnt))
+ 					return (EFAULT);
+ 			} else {
+-				if (copy_from_user(p, iov->iov_base+skip, cnt))
+-					return (EFAULT);
++				if (uio->uio_fault_disable) {
++					if (!access_ok(VERIFY_READ,
++					    (iov->iov_base + skip), cnt)) {
++						return (EFAULT);
++					}
++
++					pagefault_disable();
++					if (__copy_from_user_inatomic(p,
++					    (iov->iov_base + skip), cnt)) {
++						pagefault_enable();
++						return (EFAULT);
++					}
++					pagefault_enable();
++				} else {
++					if (copy_from_user(p,
++					    (iov->iov_base + skip), cnt))
++						return (EFAULT);
++				}
+ 			}
+ 			break;
+ 		case UIO_SYSSPACE:
+@@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
+  * error will terminate the process as this is only a best attempt to get
+  * the pages resident.
+  */
+-void
++int
+ uio_prefaultpages(ssize_t n, struct uio *uio)
+ {
+ 	const struct iovec *iov;
+@@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ 	switch (uio->uio_segflg) {
+ 		case UIO_SYSSPACE:
+ 		case UIO_BVEC:
+-			return;
++			return (0);
+ 		case UIO_USERSPACE:
+ 		case UIO_USERISPACE:
+ 			break;
+@@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ 		p = iov->iov_base + skip;
+ 		while (cnt) {
+ 			if (fuword8((uint8_t *)p, &tmp))
+-				return;
++				return (EFAULT);
+ 			incr = MIN(cnt, PAGESIZE);
+ 			p += incr;
+ 			cnt -= incr;
+@@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ 		 */
+ 		p--;
+ 		if (fuword8((uint8_t *)p, &tmp))
+-			return;
++			return (EFAULT);
+ 	}
++
++	return (0);
+ }
+ EXPORT_SYMBOL(uio_prefaultpages);
+ 
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 5a2e55eb..c866352d 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+ 		xuio = (xuio_t *)uio;
+ 	else
+ #endif
+-		uio_prefaultpages(MIN(n, max_blksz), uio);
++		if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++			ZFS_EXIT(zfsvfs);
++			return (SET_ERROR(EFAULT));
++		}
+ 
+ 	/*
+ 	 * If in append mode, set the io offset pointer to eof.
+@@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+ 
+ 		if (abuf == NULL) {
+ 			tx_bytes = uio->uio_resid;
++			uio->uio_fault_disable = B_TRUE;
+ 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ 			    uio, nbytes, tx);
++			if (error == EFAULT) {
++				dmu_tx_commit(tx);
++				if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++					break;
++				}
++				continue;
++			} else if (error != 0) {
++				dmu_tx_commit(tx);
++				break;
++			}
+ 			tx_bytes -= uio->uio_resid;
+ 		} else {
+ 			tx_bytes = nbytes;
+@@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+ 		ASSERT(tx_bytes == nbytes);
+ 		n -= nbytes;
+ 
+-		if (!xuio && n > 0)
+-			uio_prefaultpages(MIN(n, max_blksz), uio);
++		if (!xuio && n > 0) {
++			if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++				error = EFAULT;
++				break;
++			}
++		}
+ 	}
+ 
+ 	zfs_inode_update(zp);
+diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+index 190d31af..b9915d5d 100644
+--- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
++++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+@@ -31,74 +31,132 @@
+ #include <string.h>
+ #include <sys/mman.h>
+ #include <pthread.h>
++#include <errno.h>
++#include <err.h>
+ 
+ /*
+  * --------------------------------------------------------------------
+- * Bug Id: 5032643
++ * Bug Issue Id: #7512
++ * The bug time sequence:
++ * 1. context #1, zfs_write assign a txg "n".
++ * 2. In the same process, context #2, mmap page fault (which means the mm_sem
++ *    is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
++ *    txg "n" completed.
++ * 3. context #1 call uiomove to write, however page fault is occurred in
++ *    uiomove, which means it need mm_sem, but mm_sem is hold by
++ *    context #2, so it stuck and can't complete, then txg "n" will not
++ *    complete.
+  *
+- * Simply writing to a file and mmaping that file at the same time can
+- * result in deadlock.  Nothing perverse like writing from the file's
+- * own mapping is required.
++ * So context #1 and context #2 trap into the "dead lock".
+  * --------------------------------------------------------------------
+  */
+ 
++#define	NORMAL_WRITE_TH_NUM	2
++
+ static void *
+-mapper(void *fdp)
++normal_writer(void *filename)
+ {
+-	void *addr;
+-	int fd = *(int *)fdp;
++	char *file_path = filename;
++	int fd = -1;
++	ssize_t write_num = 0;
++	int page_size = getpagesize();
+ 
+-	if ((addr =
+-	    mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
+-		perror("mmap");
+-		exit(1);
++	fd = open(file_path, O_RDWR | O_CREAT, 0777);
++	if (fd == -1) {
++		err(1, "failed to open %s", file_path);
+ 	}
+-	for (;;) {
+-		if (mmap(addr, 8192, PROT_READ,
+-		    MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
+-			perror("mmap");
+-			exit(1);
++
++	char *buf = malloc(1);
++	while (1) {
++		write_num = write(fd, buf, 1);
++		if (write_num == 0) {
++			err(1, "write failed!");
++			break;
+ 		}
++		lseek(fd, page_size, SEEK_CUR);
++	}
++
++	if (buf) {
++		free(buf);
+ 	}
+-	/* NOTREACHED */
+-	return ((void *)1);
+ }
+ 
+-int
+-main(int argc, char **argv)
++static void *
++map_writer(void *filename)
+ {
+-	int fd;
+-	char buf[1024];
+-	pthread_t tid;
++	int fd = -1;
++	int ret = 0;
++	char *buf = NULL;
++	int page_size = getpagesize();
++	int op_errno = 0;
++	char *file_path = filename;
+ 
+-	memset(buf, 'a', sizeof (buf));
++	while (1) {
++		ret = access(file_path, F_OK);
++		if (ret) {
++			op_errno = errno;
++			if (op_errno == ENOENT) {
++				fd = open(file_path, O_RDWR | O_CREAT, 0777);
++				if (fd == -1) {
++					err(1, "open file failed");
++				}
+ 
+-	if (argc != 2) {
+-		(void) printf("usage: %s <file name>\n", argv[0]);
+-		exit(1);
+-	}
++				ret = ftruncate(fd, page_size);
++				if (ret == -1) {
++					err(1, "truncate file failed");
++				}
++			} else {
++				err(1, "access file failed!");
++			}
++		} else {
++			fd = open(file_path, O_RDWR, 0777);
++			if (fd == -1) {
++				err(1, "open file failed");
++			}
++		}
+ 
+-	if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
+-		perror("open");
+-		exit(1);
++		if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
++		    MAP_SHARED, fd, 0)) == MAP_FAILED) {
++			err(1, "map file failed");
++		}
++
++		if (fd != -1)
++			close(fd);
++
++		char s[10] = {0, };
++		memcpy(buf, s, 10);
++		ret = munmap(buf, page_size);
++		if (ret != 0) {
++			err(1, "unmap file failed");
++		}
+ 	}
++}
+ 
+-	(void) pthread_setconcurrency(2);
+-	if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
+-		perror("pthread_create");
+-		close(fd);
++int
++main(int argc, char **argv)
++{
++	pthread_t map_write_tid;
++	pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
++	int i = 0;
++
++	if (argc != 3) {
++		(void) printf("usage: %s <normal write file name>"
++		    "<map write file name>\n", argv[0]);
+ 		exit(1);
+ 	}
+-	for (;;) {
+-		if (write(fd, buf, sizeof (buf)) == -1) {
+-			perror("write");
+-			close(fd);
+-			exit(1);
++
++	for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
++		if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
++		    argv[1])) {
++			err(1, "pthread_create normal_writer failed.");
+ 		}
+ 	}
+ 
+-	close(fd);
++	if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
++		err(1, "pthread_create map_writer failed.");
++	}
+ 
+ 	/* NOTREACHED */
++	pthread_join(map_write_tid, NULL);
+ 	return (0);
+ }
+diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+index 1eda9710..24150b82 100755
+--- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
++++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+@@ -53,12 +53,14 @@ if ! is_mp; then
+ fi
+ 
+ log_must chmod 777 $TESTDIR
+-mmapwrite $TESTDIR/test-write-file &
++mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
+ PID_MMAPWRITE=$!
+-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
++log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
++	 "pid: $PID_MMAPWRITE"
+ log_must sleep 30
+ 
+ log_must kill -9 $PID_MMAPWRITE
+-log_must ls -l $TESTDIR/test-write-file
++log_must ls -l $TESTDIR/normal_write_file
++log_must ls -l $TESTDIR/map_write_file
+ 
+ log_pass "write(2) a mmap(2)'ing file succeeded."
diff --git a/zfs-patches/series b/zfs-patches/series
index 3b4626f..1146f65 100644
--- a/zfs-patches/series
+++ b/zfs-patches/series
@@ -4,3 +4,4 @@
 0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
 0005-Fix-race-in-dnode_check_slots_free.patch
 0006-Reduce-taskq-and-context-switch-cost-of-zio-pipe.patch
+0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
-- 
2.11.0





More information about the pve-devel mailing list