[pve-devel] [PATCH zfsonlinux v2 4/5] Backport deadlock fix for issue ZOL#7939
Stoiko Ivanov
s.ivanov at proxmox.com
Wed Oct 31 16:07:12 CET 2018
A deadlock issue got fixed upstream and merged into master [0].
This patch backports the fix by splitting it into the parts for SPL and ZFS.
[0] https://github.com/zfsonlinux/zfs/pull/7939
Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
...tween-mm_sem-and-tx-assign-in-zfs_write-a.patch | 42 +++
spl-patches/series | 1 +
...tween-mm_sem-and-tx-assign-in-zfs_write-a.patch | 376 +++++++++++++++++++++
zfs-patches/series | 1 +
4 files changed, 420 insertions(+)
create mode 100644 spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
create mode 100644 zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
diff --git a/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644
index 0000000..2906ef8
--- /dev/null
+++ b/spl-patches/0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
@@ -0,0 +1,42 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut at gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+ `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+ and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+ in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+ thread #2, so it stuck and can't complete, then txg "n" will
+ not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko at gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens at delphix.com>
+Signed-off-by: Grady Wong <grady.w at xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ include/sys/uio.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/sys/uio.h b/include/sys/uio.h
+index 764beb9..2895690 100644
+--- a/include/sys/uio.h
++++ b/include/sys/uio.h
+@@ -53,6 +53,7 @@ typedef struct uio {
+ int uio_iovcnt;
+ offset_t uio_loffset;
+ uio_seg_t uio_segflg;
++ boolean_t uio_fault_disable;
+ uint16_t uio_fmode;
+ uint16_t uio_extflg;
+ offset_t uio_limit;
diff --git a/spl-patches/series b/spl-patches/series
index 20724b7..5bcfa6a 100644
--- a/spl-patches/series
+++ b/spl-patches/series
@@ -1 +1,2 @@
0001-remove-DKMS-and-module-build.patch
+0002-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
diff --git a/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
new file mode 100644
index 0000000..6872d8b
--- /dev/null
+++ b/zfs-patches/0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
@@ -0,0 +1,376 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: ilbsmart <wgqimut at gmail.com>
+Date: Wed, 17 Oct 2018 02:11:24 +0800
+Subject: [PATCH] deadlock between mm_sem and tx assign in zfs_write() and page
+ fault
+
+The bug time sequence:
+1. thread #1, `zfs_write` assign a txg "n".
+2. In a same process, thread #2, mmap page fault (which means the
+ `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
+ and wait previous txg "n" completed.
+3. thread #1 call `uiomove` to write, however page fault is occurred
+ in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
+ thread #2, so it stuck and can't complete, then txg "n" will
+ not complete.
+
+So thread #1 and thread #2 are deadlocked.
+
+Reviewed-by: Chunwei Chen <tuxoko at gmail.com>
+Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Reviewed-by: Matthew Ahrens <mahrens at delphix.com>
+Signed-off-by: Grady Wong <grady.w at xtaotech.com>
+Closes #7939
+
+(backported from: zfs-upstream 779a6c0bf6df76e0dd92c1ccf81f48512b835bb0)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ include/sys/uio_impl.h | 2 +-
+ module/zcommon/zfs_uio.c | 31 ++++-
+ module/zfs/zfs_vnops.c | 24 +++-
+ tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 140 +++++++++++++++------
+ .../tests/functional/mmap/mmap_write_001_pos.ksh | 8 +-
+ 5 files changed, 151 insertions(+), 54 deletions(-)
+
+diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
+index 37e283da..cfef0b95 100644
+--- a/include/sys/uio_impl.h
++++ b/include/sys/uio_impl.h
+@@ -42,7 +42,7 @@
+ #include <sys/uio.h>
+
+ extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
+-extern void uio_prefaultpages(ssize_t, uio_t *);
++extern int uio_prefaultpages(ssize_t, uio_t *);
+ extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
+ extern void uioskip(uio_t *, size_t);
+
+diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
+index 7b4175bb..8e969bbc 100644
+--- a/module/zcommon/zfs_uio.c
++++ b/module/zcommon/zfs_uio.c
+@@ -50,6 +50,7 @@
+ #include <sys/types.h>
+ #include <sys/uio_impl.h>
+ #include <linux/kmap_compat.h>
++#include <linux/uaccess.h>
+
+ /*
+ * Move "n" bytes at byte address "p"; "rw" indicates the direction
+@@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+ if (copy_to_user(iov->iov_base+skip, p, cnt))
+ return (EFAULT);
+ } else {
+- if (copy_from_user(p, iov->iov_base+skip, cnt))
+- return (EFAULT);
++ if (uio->uio_fault_disable) {
++ if (!access_ok(VERIFY_READ,
++ (iov->iov_base + skip), cnt)) {
++ return (EFAULT);
++ }
++
++ pagefault_disable();
++ if (__copy_from_user_inatomic(p,
++ (iov->iov_base + skip), cnt)) {
++ pagefault_enable();
++ return (EFAULT);
++ }
++ pagefault_enable();
++ } else {
++ if (copy_from_user(p,
++ (iov->iov_base + skip), cnt))
++ return (EFAULT);
++ }
+ }
+ break;
+ case UIO_SYSSPACE:
+@@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
+ * error will terminate the process as this is only a best attempt to get
+ * the pages resident.
+ */
+-void
++int
+ uio_prefaultpages(ssize_t n, struct uio *uio)
+ {
+ const struct iovec *iov;
+@@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ switch (uio->uio_segflg) {
+ case UIO_SYSSPACE:
+ case UIO_BVEC:
+- return;
++ return (0);
+ case UIO_USERSPACE:
+ case UIO_USERISPACE:
+ break;
+@@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ p = iov->iov_base + skip;
+ while (cnt) {
+ if (fuword8((uint8_t *)p, &tmp))
+- return;
++ return (EFAULT);
+ incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+@@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
+ */
+ p--;
+ if (fuword8((uint8_t *)p, &tmp))
+- return;
++ return (EFAULT);
+ }
++
++ return (0);
+ }
+ EXPORT_SYMBOL(uio_prefaultpages);
+
+diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
+index 5a2e55eb..c866352d 100644
+--- a/module/zfs/zfs_vnops.c
++++ b/module/zfs/zfs_vnops.c
+@@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+ xuio = (xuio_t *)uio;
+ else
+ #endif
+- uio_prefaultpages(MIN(n, max_blksz), uio);
++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++ ZFS_EXIT(zfsvfs);
++ return (SET_ERROR(EFAULT));
++ }
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+@@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
++ uio->uio_fault_disable = B_TRUE;
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
++ if (error == EFAULT) {
++ dmu_tx_commit(tx);
++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++ break;
++ }
++ continue;
++ } else if (error != 0) {
++ dmu_tx_commit(tx);
++ break;
++ }
+ tx_bytes -= uio->uio_resid;
+ } else {
+ tx_bytes = nbytes;
+@@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+
+- if (!xuio && n > 0)
+- uio_prefaultpages(MIN(n, max_blksz), uio);
++ if (!xuio && n > 0) {
++ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
++ error = EFAULT;
++ break;
++ }
++ }
+ }
+
+ zfs_inode_update(zp);
+diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+index 190d31af..b9915d5d 100644
+--- a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
++++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
+@@ -31,74 +31,132 @@
+ #include <string.h>
+ #include <sys/mman.h>
+ #include <pthread.h>
++#include <errno.h>
++#include <err.h>
+
+ /*
+ * --------------------------------------------------------------------
+- * Bug Id: 5032643
++ * Bug Issue Id: #7512
++ * The bug time sequence:
++ * 1. context #1, zfs_write assign a txg "n".
++ * 2. In the same process, context #2, mmap page fault (which means the mm_sem
++ * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
++ * txg "n" completed.
++ * 3. context #1 call uiomove to write, however page fault is occurred in
++ * uiomove, which means it need mm_sem, but mm_sem is hold by
++ * context #2, so it stuck and can't complete, then txg "n" will not
++ * complete.
+ *
+- * Simply writing to a file and mmaping that file at the same time can
+- * result in deadlock. Nothing perverse like writing from the file's
+- * own mapping is required.
++ * So context #1 and context #2 trap into the "dead lock".
+ * --------------------------------------------------------------------
+ */
+
++#define NORMAL_WRITE_TH_NUM 2
++
+ static void *
+-mapper(void *fdp)
++normal_writer(void *filename)
+ {
+- void *addr;
+- int fd = *(int *)fdp;
++ char *file_path = filename;
++ int fd = -1;
++ ssize_t write_num = 0;
++ int page_size = getpagesize();
+
+- if ((addr =
+- mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
+- perror("mmap");
+- exit(1);
++ fd = open(file_path, O_RDWR | O_CREAT, 0777);
++ if (fd == -1) {
++ err(1, "failed to open %s", file_path);
+ }
+- for (;;) {
+- if (mmap(addr, 8192, PROT_READ,
+- MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
+- perror("mmap");
+- exit(1);
++
++ char *buf = malloc(1);
++ while (1) {
++ write_num = write(fd, buf, 1);
++ if (write_num == 0) {
++ err(1, "write failed!");
++ break;
+ }
++ lseek(fd, page_size, SEEK_CUR);
++ }
++
++ if (buf) {
++ free(buf);
+ }
+- /* NOTREACHED */
+- return ((void *)1);
+ }
+
+-int
+-main(int argc, char **argv)
++static void *
++map_writer(void *filename)
+ {
+- int fd;
+- char buf[1024];
+- pthread_t tid;
++ int fd = -1;
++ int ret = 0;
++ char *buf = NULL;
++ int page_size = getpagesize();
++ int op_errno = 0;
++ char *file_path = filename;
+
+- memset(buf, 'a', sizeof (buf));
++ while (1) {
++ ret = access(file_path, F_OK);
++ if (ret) {
++ op_errno = errno;
++ if (op_errno == ENOENT) {
++ fd = open(file_path, O_RDWR | O_CREAT, 0777);
++ if (fd == -1) {
++ err(1, "open file failed");
++ }
+
+- if (argc != 2) {
+- (void) printf("usage: %s <file name>\n", argv[0]);
+- exit(1);
+- }
++ ret = ftruncate(fd, page_size);
++ if (ret == -1) {
++ err(1, "truncate file failed");
++ }
++ } else {
++ err(1, "access file failed!");
++ }
++ } else {
++ fd = open(file_path, O_RDWR, 0777);
++ if (fd == -1) {
++ err(1, "open file failed");
++ }
++ }
+
+- if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
+- perror("open");
+- exit(1);
++ if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
++ MAP_SHARED, fd, 0)) == MAP_FAILED) {
++ err(1, "map file failed");
++ }
++
++ if (fd != -1)
++ close(fd);
++
++ char s[10] = {0, };
++ memcpy(buf, s, 10);
++ ret = munmap(buf, page_size);
++ if (ret != 0) {
++ err(1, "unmap file failed");
++ }
+ }
++}
+
+- (void) pthread_setconcurrency(2);
+- if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
+- perror("pthread_create");
+- close(fd);
++int
++main(int argc, char **argv)
++{
++ pthread_t map_write_tid;
++ pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
++ int i = 0;
++
++ if (argc != 3) {
++ (void) printf("usage: %s <normal write file name>"
++ "<map write file name>\n", argv[0]);
+ exit(1);
+ }
+- for (;;) {
+- if (write(fd, buf, sizeof (buf)) == -1) {
+- perror("write");
+- close(fd);
+- exit(1);
++
++ for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
++ if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
++ argv[1])) {
++ err(1, "pthread_create normal_writer failed.");
+ }
+ }
+
+- close(fd);
++ if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
++ err(1, "pthread_create map_writer failed.");
++ }
+
+ /* NOTREACHED */
++ pthread_join(map_write_tid, NULL);
+ return (0);
+ }
+diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+index 1eda9710..24150b82 100755
+--- a/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
++++ b/tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh
+@@ -53,12 +53,14 @@ if ! is_mp; then
+ fi
+
+ log_must chmod 777 $TESTDIR
+-mmapwrite $TESTDIR/test-write-file &
++mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
+ PID_MMAPWRITE=$!
+-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
++log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
++ "pid: $PID_MMAPWRITE"
+ log_must sleep 30
+
+ log_must kill -9 $PID_MMAPWRITE
+-log_must ls -l $TESTDIR/test-write-file
++log_must ls -l $TESTDIR/normal_write_file
++log_must ls -l $TESTDIR/map_write_file
+
+ log_pass "write(2) a mmap(2)'ing file succeeded."
diff --git a/zfs-patches/series b/zfs-patches/series
index 3b4626f..1146f65 100644
--- a/zfs-patches/series
+++ b/zfs-patches/series
@@ -4,3 +4,4 @@
0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
0005-Fix-race-in-dnode_check_slots_free.patch
0006-Reduce-taskq-and-context-switch-cost-of-zio-pipe.patch
+0007-deadlock-between-mm_sem-and-tx-assign-in-zfs_write-a.patch
--
2.11.0
More information about the pve-devel
mailing list