[pve-devel] [PATCH zfsonlinux] cherry-pick lock-inversion patch for zvol_open

Stoiko Ivanov s.ivanov at proxmox.com
Tue Jan 11 16:39:36 CET 2022


the changes to zvol_open added to 2.1.2 (for coping with kernel
changes in 5.13) seem to have introduced a lock order inversion [0].

(noticed while reviewing the 2.0.6->2.0.7 changes; the patch was
applied after 2.1.2 was already tagged)

[0] https://github.com/openzfs/zfs/pull/12863
Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
I did not reproduce the deadlock myself (in my very limited tests),
but given that our 2.1.2 packages (and kernel modules) have not seen
much public testing yet, I thought it makes sense to proactively
pull this in.

quickly tested a kernel+userspace with this patch on a physical testhost

 .../0012-Fix-zvol_open-lock-inversion.patch   | 212 ++++++++++++++++++
 debian/patches/series                         |   1 +
 2 files changed, 213 insertions(+)
 create mode 100644 debian/patches/0012-Fix-zvol_open-lock-inversion.patch

diff --git a/debian/patches/0012-Fix-zvol_open-lock-inversion.patch b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
new file mode 100644
index 00000000..eb74550f
--- /dev/null
+++ b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
@@ -0,0 +1,212 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1 at llnl.gov>
+Date: Fri, 17 Dec 2021 09:52:13 -0800
+Subject: [PATCH] Fix zvol_open() lock inversion
+
+When restructuring the zvol_open() logic for the Linux 5.13 kernel
+a lock inversion was accidentally introduced.  In the updated code
+the spa_namespace_lock is now taken before the zv_suspend_lock
+allowing the following scenario to occur:
+
+    down_read <=== waiting for zv_suspend_lock
+    zvol_open <=== holds spa_namespace_lock
+    __blkdev_get
+    blkdev_get_by_dev
+    blkdev_open
+    ...
+
+     mutex_lock <== waiting for spa_namespace_lock
+     spa_open_common
+     spa_open
+     dsl_pool_hold
+     dmu_objset_hold_flags
+     dmu_objset_hold
+     dsl_prop_get
+     dsl_prop_get_integer
+     zvol_create_minor
+     dmu_recv_end
+     zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
+     zfs_ioc_recv
+     ...
+
+This commit resolves the issue by moving the acquisition of the
+spa_namespace_lock back to after the zv_suspend_lock which restores
+the original ordering.
+
+Additionally, as part of this change the error exit paths were
+simplified where possible.
+
+Reviewed-by: Tony Hutter <hutter2 at llnl.gov>
+Reviewed-by: Rich Ercolani <rincebrain at gmail.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #12863
+(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
+ 1 file changed, 58 insertions(+), 63 deletions(-)
+
+diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
+index 44caadd58..69479b3f7 100644
+--- a/module/os/linux/zfs/zvol_os.c
++++ b/module/os/linux/zfs/zvol_os.c
+@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+ 	zvol_state_t *zv;
+ 	int error = 0;
+-	boolean_t drop_suspend = B_TRUE;
+-	boolean_t drop_namespace = B_FALSE;
++	boolean_t drop_suspend = B_FALSE;
+ #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+ 	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+ 	hrtime_t start = gethrtime();
+@@ -517,7 +516,36 @@ retry:
+ 		return (SET_ERROR(-ENXIO));
+ 	}
+ 
+-	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
++	mutex_enter(&zv->zv_state_lock);
++	/*
++	 * Make sure zvol is not suspended during first open
++	 * (hold zv_suspend_lock) and respect proper lock acquisition
++	 * ordering - zv_suspend_lock before zv_state_lock
++	 */
++	if (zv->zv_open_count == 0) {
++		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++			mutex_exit(&zv->zv_state_lock);
++			rw_enter(&zv->zv_suspend_lock, RW_READER);
++			mutex_enter(&zv->zv_state_lock);
++			/* check to see if zv_suspend_lock is needed */
++			if (zv->zv_open_count != 0) {
++				rw_exit(&zv->zv_suspend_lock);
++			} else {
++				drop_suspend = B_TRUE;
++			}
++		} else {
++			drop_suspend = B_TRUE;
++		}
++	}
++	rw_exit(&zvol_state_lock);
++
++	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++
++	if (zv->zv_open_count == 0) {
++		boolean_t drop_namespace = B_FALSE;
++
++		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
++
+ 		/*
+ 		 * In all other call paths the spa_namespace_lock is taken
+ 		 * before the bdev->bd_mutex lock.  However, on open(2)
+@@ -542,84 +570,51 @@ retry:
+ 		 * the kernel so the only option is to return the error for
+ 		 * the caller to handle it.
+ 		 */
+-		if (!mutex_tryenter(&spa_namespace_lock)) {
+-			rw_exit(&zvol_state_lock);
++		if (!mutex_owned(&spa_namespace_lock)) {
++			if (!mutex_tryenter(&spa_namespace_lock)) {
++				mutex_exit(&zv->zv_state_lock);
++				rw_exit(&zv->zv_suspend_lock);
+ 
+ #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+-			schedule();
+-			return (SET_ERROR(-ERESTARTSYS));
+-#else
+-			if ((gethrtime() - start) > timeout)
++				schedule();
+ 				return (SET_ERROR(-ERESTARTSYS));
++#else
++				if ((gethrtime() - start) > timeout)
++					return (SET_ERROR(-ERESTARTSYS));
+ 
+-			schedule_timeout(MSEC_TO_TICK(10));
+-			goto retry;
++				schedule_timeout(MSEC_TO_TICK(10));
++				goto retry;
+ #endif
+-		} else {
+-			drop_namespace = B_TRUE;
+-		}
+-	}
+-
+-	mutex_enter(&zv->zv_state_lock);
+-	/*
+-	 * make sure zvol is not suspended during first open
+-	 * (hold zv_suspend_lock) and respect proper lock acquisition
+-	 * ordering - zv_suspend_lock before zv_state_lock
+-	 */
+-	if (zv->zv_open_count == 0) {
+-		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+-			mutex_exit(&zv->zv_state_lock);
+-			rw_enter(&zv->zv_suspend_lock, RW_READER);
+-			mutex_enter(&zv->zv_state_lock);
+-			/* check to see if zv_suspend_lock is needed */
+-			if (zv->zv_open_count != 0) {
+-				rw_exit(&zv->zv_suspend_lock);
+-				drop_suspend = B_FALSE;
++			} else {
++				drop_namespace = B_TRUE;
+ 			}
+ 		}
+-	} else {
+-		drop_suspend = B_FALSE;
+-	}
+-	rw_exit(&zvol_state_lock);
+-
+-	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ 
+-	if (zv->zv_open_count == 0) {
+-		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ 		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+-		if (error)
+-			goto out_mutex;
+-	}
+ 
+-	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+-		error = -EROFS;
+-		goto out_open_count;
++		if (drop_namespace)
++			mutex_exit(&spa_namespace_lock);
+ 	}
+ 
+-	zv->zv_open_count++;
+-
+-	mutex_exit(&zv->zv_state_lock);
+-	if (drop_namespace)
+-		mutex_exit(&spa_namespace_lock);
+-	if (drop_suspend)
+-		rw_exit(&zv->zv_suspend_lock);
+-
+-	zfs_check_media_change(bdev);
+-
+-	return (0);
++	if (error == 0) {
++		if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
++			if (zv->zv_open_count == 0)
++				zvol_last_close(zv);
+ 
+-out_open_count:
+-	if (zv->zv_open_count == 0)
+-		zvol_last_close(zv);
++			error = SET_ERROR(-EROFS);
++		} else {
++			zv->zv_open_count++;
++		}
++	}
+ 
+-out_mutex:
+ 	mutex_exit(&zv->zv_state_lock);
+-	if (drop_namespace)
+-		mutex_exit(&spa_namespace_lock);
+ 	if (drop_suspend)
+ 		rw_exit(&zv->zv_suspend_lock);
+ 
+-	return (SET_ERROR(error));
++	if (error == 0)
++		zfs_check_media_change(bdev);
++
++	return (error);
+ }
+ 
+ static void
diff --git a/debian/patches/series b/debian/patches/series
index d2770d39..8166db91 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -9,3 +9,4 @@
 0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
 0010-arcstat-Fix-integer-division-with-python3.patch
 0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+0012-Fix-zvol_open-lock-inversion.patch
-- 
2.30.2






More information about the pve-devel mailing list