[pve-devel] [PATCH zfsonlinux] cherry-pick lock-inversion patch for zvol_open
Stoiko Ivanov
s.ivanov at proxmox.com
Tue Jan 11 16:39:36 CET 2022
The changes to zvol_open added to 2.1.2 (for coping with kernel
changes in 5.13) seem to have introduced a lock order inversion [0].
(Noticed while reviewing the 2.0.6->2.0.7 changes; the patch was
applied after 2.1.2 was already tagged.)
[0] https://github.com/openzfs/zfs/pull/12863
Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
I did not reproduce the deadlock myself (in my very limited tests),
but given that our 2.1.2 packages (and kernel modules) have not seen
much testing in public yet, I thought it makes sense to proactively
pull this in.
Quickly tested a kernel+userspace with this patch on a physical test host.
.../0012-Fix-zvol_open-lock-inversion.patch | 212 ++++++++++++++++++
debian/patches/series | 1 +
2 files changed, 213 insertions(+)
create mode 100644 debian/patches/0012-Fix-zvol_open-lock-inversion.patch
diff --git a/debian/patches/0012-Fix-zvol_open-lock-inversion.patch b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
new file mode 100644
index 00000000..eb74550f
--- /dev/null
+++ b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
@@ -0,0 +1,212 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1 at llnl.gov>
+Date: Fri, 17 Dec 2021 09:52:13 -0800
+Subject: [PATCH] Fix zvol_open() lock inversion
+
+When restructuring the zvol_open() logic for the Linux 5.13 kernel
+a lock inversion was accidentally introduced. In the updated code
+the spa_namespace_lock is now taken before the zv_suspend_lock
+allowing the following scenario to occur:
+
+ down_read <=== waiting for zv_suspend_lock
+ zvol_open <=== holds spa_namespace_lock
+ __blkdev_get
+ blkdev_get_by_dev
+ blkdev_open
+ ...
+
+ mutex_lock <== waiting for spa_namespace_lock
+ spa_open_common
+ spa_open
+ dsl_pool_hold
+ dmu_objset_hold_flags
+ dmu_objset_hold
+ dsl_prop_get
+ dsl_prop_get_integer
+ zvol_create_minor
+ dmu_recv_end
+ zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
+ zfs_ioc_recv
+ ...
+
+This commit resolves the issue by moving the acquisition of the
+spa_namespace_lock back to after the zv_suspend_lock which restores
+the original ordering.
+
+Additionally, as part of this change the error exit paths were
+simplified where possible.
+
+Reviewed-by: Tony Hutter <hutter2 at llnl.gov>
+Reviewed-by: Rich Ercolani <rincebrain at gmail.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #12863
+(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
+Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
+---
+ module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
+ 1 file changed, 58 insertions(+), 63 deletions(-)
+
+diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
+index 44caadd58..69479b3f7 100644
+--- a/module/os/linux/zfs/zvol_os.c
++++ b/module/os/linux/zfs/zvol_os.c
+@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+ zvol_state_t *zv;
+ int error = 0;
+- boolean_t drop_suspend = B_TRUE;
+- boolean_t drop_namespace = B_FALSE;
++ boolean_t drop_suspend = B_FALSE;
+ #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+ hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+ hrtime_t start = gethrtime();
+@@ -517,7 +516,36 @@ retry:
+ return (SET_ERROR(-ENXIO));
+ }
+
+- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
++ mutex_enter(&zv->zv_state_lock);
++ /*
++ * Make sure zvol is not suspended during first open
++ * (hold zv_suspend_lock) and respect proper lock acquisition
++ * ordering - zv_suspend_lock before zv_state_lock
++ */
++ if (zv->zv_open_count == 0) {
++ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++ mutex_exit(&zv->zv_state_lock);
++ rw_enter(&zv->zv_suspend_lock, RW_READER);
++ mutex_enter(&zv->zv_state_lock);
++ /* check to see if zv_suspend_lock is needed */
++ if (zv->zv_open_count != 0) {
++ rw_exit(&zv->zv_suspend_lock);
++ } else {
++ drop_suspend = B_TRUE;
++ }
++ } else {
++ drop_suspend = B_TRUE;
++ }
++ }
++ rw_exit(&zvol_state_lock);
++
++ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++
++ if (zv->zv_open_count == 0) {
++ boolean_t drop_namespace = B_FALSE;
++
++ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
++
+ /*
+ * In all other call paths the spa_namespace_lock is taken
+ * before the bdev->bd_mutex lock. However, on open(2)
+@@ -542,84 +570,51 @@ retry:
+ * the kernel so the only option is to return the error for
+ * the caller to handle it.
+ */
+- if (!mutex_tryenter(&spa_namespace_lock)) {
+- rw_exit(&zvol_state_lock);
++ if (!mutex_owned(&spa_namespace_lock)) {
++ if (!mutex_tryenter(&spa_namespace_lock)) {
++ mutex_exit(&zv->zv_state_lock);
++ rw_exit(&zv->zv_suspend_lock);
+
+ #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+- schedule();
+- return (SET_ERROR(-ERESTARTSYS));
+-#else
+- if ((gethrtime() - start) > timeout)
++ schedule();
+ return (SET_ERROR(-ERESTARTSYS));
++#else
++ if ((gethrtime() - start) > timeout)
++ return (SET_ERROR(-ERESTARTSYS));
+
+- schedule_timeout(MSEC_TO_TICK(10));
+- goto retry;
++ schedule_timeout(MSEC_TO_TICK(10));
++ goto retry;
+ #endif
+- } else {
+- drop_namespace = B_TRUE;
+- }
+- }
+-
+- mutex_enter(&zv->zv_state_lock);
+- /*
+- * make sure zvol is not suspended during first open
+- * (hold zv_suspend_lock) and respect proper lock acquisition
+- * ordering - zv_suspend_lock before zv_state_lock
+- */
+- if (zv->zv_open_count == 0) {
+- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+- mutex_exit(&zv->zv_state_lock);
+- rw_enter(&zv->zv_suspend_lock, RW_READER);
+- mutex_enter(&zv->zv_state_lock);
+- /* check to see if zv_suspend_lock is needed */
+- if (zv->zv_open_count != 0) {
+- rw_exit(&zv->zv_suspend_lock);
+- drop_suspend = B_FALSE;
++ } else {
++ drop_namespace = B_TRUE;
+ }
+ }
+- } else {
+- drop_suspend = B_FALSE;
+- }
+- rw_exit(&zvol_state_lock);
+-
+- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+- if (zv->zv_open_count == 0) {
+- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+- if (error)
+- goto out_mutex;
+- }
+
+- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+- error = -EROFS;
+- goto out_open_count;
++ if (drop_namespace)
++ mutex_exit(&spa_namespace_lock);
+ }
+
+- zv->zv_open_count++;
+-
+- mutex_exit(&zv->zv_state_lock);
+- if (drop_namespace)
+- mutex_exit(&spa_namespace_lock);
+- if (drop_suspend)
+- rw_exit(&zv->zv_suspend_lock);
+-
+- zfs_check_media_change(bdev);
+-
+- return (0);
++ if (error == 0) {
++ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
++ if (zv->zv_open_count == 0)
++ zvol_last_close(zv);
+
+-out_open_count:
+- if (zv->zv_open_count == 0)
+- zvol_last_close(zv);
++ error = SET_ERROR(-EROFS);
++ } else {
++ zv->zv_open_count++;
++ }
++ }
+
+-out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+- if (drop_namespace)
+- mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+- return (SET_ERROR(error));
++ if (error == 0)
++ zfs_check_media_change(bdev);
++
++ return (error);
+ }
+
+ static void
diff --git a/debian/patches/series b/debian/patches/series
index d2770d39..8166db91 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -9,3 +9,4 @@
0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
0010-arcstat-Fix-integer-division-with-python3.patch
0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+0012-Fix-zvol_open-lock-inversion.patch
--
2.30.2
More information about the pve-devel
mailing list