[pve-devel] [RFC PATCH zfsonlinux] Add backported rename/locking fixes
Fabian Grünbichler
f.gruenbichler at proxmox.com
Mon Mar 21 16:48:44 CET 2016
backport three upstream commits related to zvol minor
operations, concurrency and locking. These should hopefully
fix the deadlock occuring on parallel zvol_open and rename
operations.
---
Tested with various parallel create, rename, destroy and
snapshot & rename commands in a loop. systemd-udevd itself
seems to also have a problem when rapidly creating and
destroying device nodes, but with these patches applied
the zfs side seems fixed (no more hanging zpools!).
Waiting for the device node to appear (/dev/zvol/<pool>/<vol>)
seems to work better than using udevadm settle/trigger.
Note: more sets of eyes and more testing will be required
before actually applying this!
...1-Fix-lock-order-inversion-with-zvol_open.patch | 111 ++
...Make-zvol-minor-functionality-more-robust.patch | 278 +++++
...rt-for-asynchronous-zvol-minor-operations.patch | 1293 ++++++++++++++++++++
zfs-patches/series | 3 +
4 files changed, 1685 insertions(+)
create mode 100644 zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
create mode 100644 zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
create mode 100644 zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch
diff --git a/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch b/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
new file mode 100644
index 0000000..5ae1f3a
--- /dev/null
+++ b/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
@@ -0,0 +1,111 @@
+From 1ee159f423b5eb3c4646b0ba2dd0fb359503ba90 Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Wed, 23 Sep 2015 12:34:51 -0400
+Subject: [PATCH] Fix lock order inversion with zvol_open()
+
+zfsonlinux issue #3681 - lock order inversion between zvol_open() and
+dsl_pool_sync()...zvol_rename_minors()
+
+Remove trylock of spa_namespace_lock as it is no longer needed when
+zvol minor operations are performed in a separate context with no
+prior locking state; the spa_namespace_lock is no longer held
+when bdev->bd_mutex or zfs_state_lock might be taken in the code
+paths originating from the zvol minor operation callbacks.
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #3681
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ module/zfs/zvol.c | 57 ++++++++++++++++---------------------------------------
+ 1 file changed, 16 insertions(+), 41 deletions(-)
+
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 613b47e..6996242 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -874,54 +874,27 @@ zvol_first_open(zvol_state_t *zv)
+ {
+ objset_t *os;
+ uint64_t volsize;
+- int locked = 0;
+ int error;
+ uint64_t ro;
+
+- /*
+- * In all other cases the spa_namespace_lock is taken before the
+- * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
+- * function calls fops->open() with the bdev->bd_mutex lock held.
+- *
+- * To avoid a potential lock inversion deadlock we preemptively
+- * try to take the spa_namespace_lock(). Normally it will not
+- * be contended and this is safe because spa_open_common() handles
+- * the case where the caller already holds the spa_namespace_lock.
+- *
+- * When it is contended we risk a lock inversion if we were to
+- * block waiting for the lock. Luckily, the __blkdev_get()
+- * function allows us to return -ERESTARTSYS which will result in
+- * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+- * called again. This process can be repeated safely until both
+- * locks are acquired.
+- */
+- if (!mutex_owned(&spa_namespace_lock)) {
+- locked = mutex_tryenter(&spa_namespace_lock);
+- if (!locked)
+- return (-SET_ERROR(ERESTARTSYS));
+- }
+-
+- error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+- if (error)
+- goto out_mutex;
+-
+ /* lie and say we're read-only */
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
+ if (error)
+- goto out_mutex;
++ return (SET_ERROR(-error));
++
++ zv->zv_objset = os;
++
++ error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
++ if (error)
++ goto out_owned;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+- if (error) {
+- dmu_objset_disown(os, zvol_tag);
+- goto out_mutex;
+- }
++ if (error)
++ goto out_owned;
+
+- zv->zv_objset = os;
+ error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+- if (error) {
+- dmu_objset_disown(os, zvol_tag);
+- goto out_mutex;
+- }
++ if (error)
++ goto out_owned;
+
+ set_capacity(zv->zv_disk, volsize >> 9);
+ zv->zv_volsize = volsize;
+@@ -936,9 +909,11 @@ zvol_first_open(zvol_state_t *zv)
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ }
+
+-out_mutex:
+- if (locked)
+- mutex_exit(&spa_namespace_lock);
++out_owned:
++ if (error) {
++ dmu_objset_disown(os, zvol_tag);
++ zv->zv_objset = NULL;
++ }
+
+ return (SET_ERROR(-error));
+ }
+--
+2.1.4
+
diff --git a/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch b/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
new file mode 100644
index 0000000..1861b73
--- /dev/null
+++ b/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
@@ -0,0 +1,278 @@
+From 5428dc51fb55145fbac1c142402dafc11d1e7d28 Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Tue, 16 Feb 2016 14:52:55 -0500
+Subject: [PATCH] Make zvol minor functionality more robust
+
+Close the race window in zvol_open() to prevent removal of
+zvol_state in the 'first open' code path. Move the call to
+check_disk_change() under zvol_state_lock to make sure the
+zvol_media_changed() and zvol_revalidate_disk() called by
+check_disk_change() are invoked with positive zv_open_count.
+
+Skip opened zvols when removing minors and set private_data
+to NULL for zvols that are not in use whose minors are being
+removed, to indicate to zvol_open() that the state is gone.
+Skip opened zvols when renaming minors to avoid modifying
+zv_name that might be in use, e.g. in zvol_ioctl().
+
+Drop zvol_state_lock before calling add_disk() when creating
+minors to avoid deadlocks with zvol_open().
+
+Wrap dmu_objset_find() with spl_fstran_mark()/unmark().
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Signed-off-by: Richard Yao <ryao at gentoo.org>
+Closes #4344
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ module/zfs/zvol.c | 94 +++++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 77 insertions(+), 17 deletions(-)
+
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 6996242..8b0be3f 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -33,6 +33,8 @@
+ *
+ * Volumes are persistent through reboot and module load. No user command
+ * needs to be run before opening and using a device.
++ *
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <sys/dbuf.h>
+@@ -599,6 +601,8 @@ zvol_write(struct bio *bio)
+ dmu_tx_t *tx;
+ rl_t *rl;
+
++ ASSERT(zv && zv->zv_open_count > 0);
++
+ if (bio->bi_rw & VDEV_REQ_FLUSH)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+@@ -647,6 +651,8 @@ zvol_discard(struct bio *bio)
+ int error;
+ rl_t *rl;
+
++ ASSERT(zv && zv->zv_open_count > 0);
++
+ if (end > zv->zv_volsize)
+ return (SET_ERROR(EIO));
+
+@@ -693,7 +699,7 @@ zvol_read(struct bio *bio)
+
+ if (len == 0)
+ return (0);
+-
++ ASSERT(zv && zv->zv_open_count > 0);
+
+ rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
+
+@@ -942,7 +948,7 @@ zvol_last_close(zvol_state_t *zv)
+ static int
+ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+- zvol_state_t *zv = bdev->bd_disk->private_data;
++ zvol_state_t *zv;
+ int error = 0, drop_mutex = 0;
+
+ /*
+@@ -956,7 +962,17 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ drop_mutex = 1;
+ }
+
+- ASSERT3P(zv, !=, NULL);
++ /*
++ * Obtain a copy of private_data under the lock to make sure
++ * that either the result of zvol_freeg() setting
++ * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
++ * is not called on this zv because of the positive zv_open_count.
++ */
++ zv = bdev->bd_disk->private_data;
++ if (zv == NULL) {
++ error = -ENXIO;
++ goto out_mutex;
++ }
+
+ if (zv->zv_open_count == 0) {
+ error = zvol_first_open(zv);
+@@ -971,6 +987,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+
+ zv->zv_open_count++;
+
++ check_disk_change(bdev);
++
+ out_open_count:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+@@ -979,8 +997,6 @@ out_mutex:
+ if (drop_mutex)
+ mutex_exit(&zvol_state_lock);
+
+- check_disk_change(bdev);
+-
+ return (SET_ERROR(error));
+ }
+
+@@ -994,16 +1010,16 @@ zvol_release(struct gendisk *disk, fmode_t mode)
+ zvol_state_t *zv = disk->private_data;
+ int drop_mutex = 0;
+
++ ASSERT(zv && zv->zv_open_count > 0);
++
+ if (!mutex_owned(&zvol_state_lock)) {
+ mutex_enter(&zvol_state_lock);
+ drop_mutex = 1;
+ }
+
+- if (zv->zv_open_count > 0) {
+- zv->zv_open_count--;
+- if (zv->zv_open_count == 0)
+- zvol_last_close(zv);
+- }
++ zv->zv_open_count--;
++ if (zv->zv_open_count == 0)
++ zvol_last_close(zv);
+
+ if (drop_mutex)
+ mutex_exit(&zvol_state_lock);
+@@ -1020,8 +1036,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ int error = 0;
+
+- if (zv == NULL)
+- return (SET_ERROR(-ENXIO));
++ ASSERT(zv && zv->zv_open_count > 0);
+
+ switch (cmd) {
+ case BLKFLSBUF:
+@@ -1055,6 +1070,8 @@ static int zvol_media_changed(struct gendisk *disk)
+ {
+ zvol_state_t *zv = disk->private_data;
+
++ ASSERT(zv && zv->zv_open_count > 0);
++
+ return (zv->zv_changed);
+ }
+
+@@ -1062,6 +1079,8 @@ static int zvol_revalidate_disk(struct gendisk *disk)
+ {
+ zvol_state_t *zv = disk->private_data;
+
++ ASSERT(zv && zv->zv_open_count > 0);
++
+ zv->zv_changed = 0;
+ set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+
+@@ -1078,7 +1097,11 @@ static int
+ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+ {
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+- sector_t sectors = get_capacity(zv->zv_disk);
++ sector_t sectors;
++
++ ASSERT(zv && zv->zv_open_count > 0);
++
++ sectors = get_capacity(zv->zv_disk);
+
+ if (sectors > 2048) {
+ geo->heads = 16;
+@@ -1235,9 +1258,14 @@ out_kmem:
+ static void
+ zvol_free(zvol_state_t *zv)
+ {
++ ASSERT(MUTEX_HELD(&zvol_state_lock));
++ ASSERT(zv->zv_open_count == 0);
++
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
+
++ zv->zv_disk->private_data = NULL;
++
+ del_gendisk(zv->zv_disk);
+ blk_cleanup_queue(zv->zv_queue);
+ put_disk(zv->zv_disk);
+@@ -1370,7 +1398,15 @@ out:
+
+ if (error == 0) {
+ zvol_insert(zv);
++ /*
++ * Drop the lock to prevent deadlock with sys_open() ->
++ * zvol_open(), which first takes bd_disk->bd_mutex and then
++ * takes zvol_state_lock, whereas this code path first takes
++ * zvol_state_lock, and then takes bd_disk->bd_mutex.
++ */
++ mutex_exit(&zvol_state_lock);
+ add_disk(zv->zv_disk);
++ mutex_enter(&zvol_state_lock);
+ }
+
+ return (SET_ERROR(error));
+@@ -1467,10 +1503,15 @@ int
+ zvol_create_minors(const char *name)
+ {
+ int error = 0;
++ fstrans_cookie_t cookie;
+
+- if (!zvol_inhibit_dev)
+- error = dmu_objset_find((char *)name, zvol_create_minors_cb,
+- NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
++ if (zvol_inhibit_dev)
++ return (0);
++
++ cookie = spl_fstrans_mark();
++ error = dmu_objset_find((char *)name, zvol_create_minors_cb,
++ NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
++ spl_fstrans_unmark(cookie);
+
+ return (SET_ERROR(error));
+ }
+@@ -1494,7 +1535,13 @@ zvol_remove_minors(const char *name)
+
+ if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
+ (strncmp(zv->zv_name, name, namelen) == 0 &&
+- zv->zv_name[namelen] == '/')) {
++ (zv->zv_name[namelen] == '/' ||
++ zv->zv_name[namelen] == '@'))) {
++
++ /* If in use, leave alone */
++ if (zv->zv_open_count > 0)
++ continue;
++
+ zvol_remove(zv);
+ zvol_free(zv);
+ }
+@@ -1525,6 +1572,10 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
++ /* If in use, leave alone */
++ if (zv->zv_open_count > 0)
++ continue;
++
+ if (strcmp(zv->zv_name, oldname) == 0) {
+ __zvol_rename_minor(zv, newname);
+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+@@ -1565,8 +1616,17 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+
+ int
+ zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
++ fstrans_cookie_t cookie;
++
++ if (zvol_inhibit_dev)
++ /* caller should continue to modify snapdev property */
++ return (-1);
++
++ cookie = spl_fstrans_mark();
+ (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+ &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
++ spl_fstrans_unmark(cookie);
++
+ /* caller should continue to modify snapdev property */
+ return (-1);
+ }
+--
+2.1.4
+
diff --git a/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch b/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch
new file mode 100644
index 0000000..be97aab
--- /dev/null
+++ b/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch
@@ -0,0 +1,1293 @@
+From a0bd735adb1b1eb81fef10b4db102ee051c4d4ff Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Sat, 22 Mar 2014 05:07:14 -0400
+Subject: [PATCH] Add support for asynchronous zvol minor operations
+
+zfsonlinux issue #2217 - zvol minor operations: check snapdev
+property before traversing snapshots of a dataset
+
+zfsonlinux issue #3681 - lock order inversion between zvol_open()
+and dsl_pool_sync()...zvol_rename_minors()
+
+Create a per-pool zvol taskq for asynchronous zvol tasks.
+There are a few key design decisions to be aware of.
+
+* Each taskq must be single threaded to ensure tasks are always
+ processed in the order in which they were dispatched.
+
+* There is a taskq per-pool in order to keep the pools independent.
+ This way if one pool is suspended it will not impact another.
+
+* The preferred location to dispatch a zvol minor task is a sync
+ task. In this context there is easy access to the spa_t and
+ minimal error handling is required because the sync task must
+ succeed.
+
+Support for asynchronous zvol minor operations address issue #3681.
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #2217
+Closes #3678
+Closes #3681
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ include/sys/spa_impl.h | 2 +
+ include/sys/zvol.h | 15 +-
+ lib/libzpool/kernel.c | 22 +++
+ module/zfs/dmu_objset.c | 4 +
+ module/zfs/dmu_send.c | 3 +
+ module/zfs/dsl_dataset.c | 36 +---
+ module/zfs/dsl_destroy.c | 8 +-
+ module/zfs/dsl_dir.c | 6 +-
+ module/zfs/spa.c | 37 +++-
+ module/zfs/zfs_ioctl.c | 50 ++---
+ module/zfs/zvol.c | 493 +++++++++++++++++++++++++++++++++++------------
+ scripts/zconfig.sh | 19 +-
+ 12 files changed, 481 insertions(+), 214 deletions(-)
+
+diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
+index 0b49c71..5176eb8 100644
+--- a/include/sys/spa_impl.h
++++ b/include/sys/spa_impl.h
+@@ -23,6 +23,7 @@
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #ifndef _SYS_SPA_IMPL_H
+@@ -252,6 +253,7 @@ struct spa {
+ uint64_t spa_deadman_synctime; /* deadman expiration timer */
+ uint64_t spa_errata; /* errata issues detected */
+ spa_stats_t spa_stats; /* assorted spa statistics */
++ taskq_t *spa_zvol_taskq; /* Taskq for minor managment */
+
+ /*
+ * spa_refcount & spa_config_lock must be the last elements
+diff --git a/include/sys/zvol.h b/include/sys/zvol.h
+index 898e235..c3e386f 100644
+--- a/include/sys/zvol.h
++++ b/include/sys/zvol.h
+@@ -21,6 +21,7 @@
+
+ /*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #ifndef _SYS_ZVOL_H
+@@ -31,24 +32,22 @@
+ #define ZVOL_OBJ 1ULL
+ #define ZVOL_ZAP_OBJ 2ULL
+
+-#ifdef _KERNEL
++extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async);
++extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async);
++extern void zvol_rename_minors(spa_t *spa, const char *oldname,
++ const char *newname, boolean_t async);
+
++#ifdef _KERNEL
+ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+ extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize);
+ extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+ extern boolean_t zvol_is_zvol(const char *);
+ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+-extern int zvol_create_minor(const char *name);
+-extern int zvol_create_minors(const char *name);
+-extern int zvol_remove_minor(const char *name);
+-extern void zvol_remove_minors(const char *name);
+-extern void zvol_rename_minors(const char *oldname, const char *newname);
+ extern int zvol_set_volsize(const char *, uint64_t);
+ extern int zvol_set_volblocksize(const char *, uint64_t);
+-extern int zvol_set_snapdev(const char *, uint64_t);
++extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t);
+
+ extern int zvol_init(void);
+ extern void zvol_fini(void);
+-
+ #endif /* _KERNEL */
+ #endif /* _SYS_ZVOL_H */
+diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
+index a451026..f1ce8b3 100644
+--- a/lib/libzpool/kernel.c
++++ b/lib/libzpool/kernel.c
+@@ -20,6 +20,7 @@
+ */
+ /*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <assert.h>
+@@ -1322,3 +1323,24 @@ spl_fstrans_check(void)
+ {
+ return (0);
+ }
++
++void
++zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_remove_minor(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
++ boolean_t async)
++{
++}
+diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
+index 779b3bb..f2d492e 100644
+--- a/module/zfs/dmu_objset.c
++++ b/module/zfs/dmu_objset.c
+@@ -26,6 +26,7 @@
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ /* Portions Copyright 2010 Robert Milkowski */
+@@ -900,6 +901,8 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
+ }
+
+ spa_history_log_internal_ds(ds, "create", tx, "");
++ zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE);
++
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ }
+@@ -993,6 +996,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
+ dsl_dataset_name(origin, namebuf);
+ spa_history_log_internal_ds(ds, "clone", tx,
+ "origin=%s (%llu)", namebuf, origin->ds_object);
++ zvol_create_minors(dp->dp_spa, doca->doca_clone, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
+index b2d844e..6a349c6 100644
+--- a/module/zfs/dmu_send.c
++++ b/module/zfs/dmu_send.c
+@@ -24,6 +24,7 @@
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <sys/dmu.h>
+@@ -53,6 +54,7 @@
+ #include <sys/blkptr.h>
+ #include <sys/dsl_bookmark.h>
+ #include <sys/zfeature.h>
++#include <sys/zvol.h>
+
+ /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+ int zfs_send_corrupt_data = B_FALSE;
+@@ -2162,6 +2164,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ }
+ drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
++ zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
+ /*
+ * Release the hold from dmu_recv_begin. This must be done before
+ * we return to open context, so that when we free the dataset's dnode,
+diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
+index 2168f28..a063885 100644
+--- a/module/zfs/dsl_dataset.c
++++ b/module/zfs/dsl_dataset.c
+@@ -24,6 +24,7 @@
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <sys/dmu_objset.h>
+@@ -1376,6 +1377,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+ dsl_props_set_sync_impl(ds->ds_prev,
+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+ }
++ zvol_create_minors(dp->dp_spa, nvpair_name(pair), B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ }
+@@ -1450,16 +1452,6 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+ fnvlist_free(suspended);
+ }
+
+-#ifdef _KERNEL
+- if (error == 0) {
+- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+- pair = nvlist_next_nvpair(snaps, pair)) {
+- char *snapname = nvpair_name(pair);
+- zvol_create_minors(snapname);
+- }
+- }
+-#endif
+-
+ return (error);
+ }
+
+@@ -1870,6 +1862,8 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx));
++ zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
++ ddrsa->ddrsa_newsnapname, B_TRUE);
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+@@ -1898,11 +1892,6 @@ int
+ dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+ {
+-#ifdef _KERNEL
+- char *oldname, *newname;
+-#endif
+- int error;
+-
+ dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+ ddrsa.ddrsa_fsname = fsname;
+@@ -1910,22 +1899,9 @@ dsl_dataset_rename_snapshot(const char *fsname,
+ ddrsa.ddrsa_newsnapname = newsnapname;
+ ddrsa.ddrsa_recursive = recursive;
+
+- error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
++ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+ dsl_dataset_rename_snapshot_sync, &ddrsa,
+- 1, ZFS_SPACE_CHECK_RESERVED);
+-
+- if (error)
+- return (SET_ERROR(error));
+-
+-#ifdef _KERNEL
+- oldname = kmem_asprintf("%s@%s", fsname, oldsnapname);
+- newname = kmem_asprintf("%s@%s", fsname, newsnapname);
+- zvol_rename_minors(oldname, newname);
+- strfree(newname);
+- strfree(oldname);
+-#endif
+-
+- return (0);
++ 1, ZFS_SPACE_CHECK_RESERVED));
+ }
+
+ /*
+diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
+index 0e2238f..34d076e 100644
+--- a/module/zfs/dsl_destroy.c
++++ b/module/zfs/dsl_destroy.c
+@@ -23,6 +23,7 @@
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <sys/zfs_context.h>
+@@ -40,6 +41,7 @@
+ #include <sys/zfs_ioctl.h>
+ #include <sys/dsl_deleg.h>
+ #include <sys/dmu_impl.h>
++#include <sys/zvol.h>
+
+ typedef struct dmu_snapshots_destroy_arg {
+ nvlist_t *dsda_snaps;
+@@ -243,9 +245,6 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+ void
+ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+ {
+-#ifdef ZFS_DEBUG
+- int err;
+-#endif
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+@@ -438,6 +437,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+ #ifdef ZFS_DEBUG
+ {
+ uint64_t val;
++ int err;
+
+ err = dsl_dataset_snap_lookup(ds_head,
+ ds->ds_snapname, &val);
+@@ -487,6 +487,7 @@ dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+
+ dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
++ zvol_remove_minors(dp->dp_spa, nvpair_name(pair), B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ }
+@@ -881,6 +882,7 @@ dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+ dsl_destroy_head_sync_impl(ds, tx);
++ zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
+index ba6c244..762e2e5 100644
+--- a/module/zfs/dsl_dir.c
++++ b/module/zfs/dsl_dir.c
+@@ -24,6 +24,7 @@
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ #include <sys/dmu.h>
+@@ -1913,9 +1914,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
+ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+-#ifdef _KERNEL
+- zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
+-#endif
++ zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
++ ddra->ddra_newname, B_TRUE);
+
+ dsl_prop_notify_all(dd);
+
+diff --git a/module/zfs/spa.c b/module/zfs/spa.c
+index b4831a7..4468cdf 100644
+--- a/module/zfs/spa.c
++++ b/module/zfs/spa.c
+@@ -24,6 +24,7 @@
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ /*
+@@ -1131,6 +1132,24 @@ spa_activate(spa_t *spa, int mode)
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
++
++ /*
++ * This taskq is used to perform zvol-minor-related tasks
++ * asynchronously. This has several advantages, including easy
++ * resolution of various deadlocks (zfsonlinux bug #3681).
++ *
++ * The taskq must be single threaded to ensure tasks are always
++ * processed in the order in which they were dispatched.
++ *
++ * A taskq per pool allows one to keep the pools independent.
++ * This way if one pool is suspended, it will not impact another.
++ *
++ * The preferred location to dispatch a zvol minor task is a sync
++ * task. In this context, there is easy access to the spa_t and minimal
++ * error handling is required because the sync task must succeed.
++ */
++ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
++ 1, INT_MAX, 0);
+ }
+
+ /*
+@@ -1149,6 +1168,11 @@ spa_deactivate(spa_t *spa)
+
+ spa_evicting_os_wait(spa);
+
++ if (spa->spa_zvol_taskq) {
++ taskq_destroy(spa->spa_zvol_taskq);
++ spa->spa_zvol_taskq = NULL;
++ }
++
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_config_dirty_list);
+@@ -3083,10 +3107,8 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ mutex_exit(&spa_namespace_lock);
+ }
+
+-#ifdef _KERNEL
+ if (firstopen)
+- zvol_create_minors(spa->spa_name);
+-#endif
++ zvol_create_minors(spa, spa_name(spa), B_TRUE);
+
+ *spapp = spa;
+
+@@ -4206,10 +4228,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+
+ mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, "import");
+-
+-#ifdef _KERNEL
+- zvol_create_minors(pool);
+-#endif
++ zvol_create_minors(spa, pool, B_TRUE);
+
+ return (0);
+ }
+@@ -4344,6 +4363,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
++ if (spa->spa_zvol_taskq) {
++ zvol_remove_minors(spa, spa_name(spa), B_TRUE);
++ taskq_wait(spa->spa_zvol_taskq);
++ }
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
+index d026e92..12d2750 100644
+--- a/module/zfs/zfs_ioctl.c
++++ b/module/zfs/zfs_ioctl.c
+@@ -30,6 +30,7 @@
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+ /*
+@@ -1500,8 +1501,7 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+ int error;
+ zfs_log_history(zc);
+ error = spa_destroy(zc->zc_name);
+- if (error == 0)
+- zvol_remove_minors(zc->zc_name);
++
+ return (error);
+ }
+
+@@ -1553,8 +1553,7 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
+
+ zfs_log_history(zc);
+ error = spa_export(zc->zc_name, NULL, force, hardforce);
+- if (error == 0)
+- zvol_remove_minors(zc->zc_name);
++
+ return (error);
+ }
+
+@@ -2395,7 +2394,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ err = zvol_set_volsize(dsname, intval);
+ break;
+ case ZFS_PROP_SNAPDEV:
+- err = zvol_set_snapdev(dsname, intval);
++ err = zvol_set_snapdev(dsname, source, intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+@@ -3189,12 +3188,6 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+ if (error != 0)
+ (void) dsl_destroy_head(fsname);
+ }
+-
+-#ifdef _KERNEL
+- if (error == 0 && type == DMU_OST_ZVOL)
+- zvol_create_minors(fsname);
+-#endif
+-
+ return (error);
+ }
+
+@@ -3237,12 +3230,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+ if (error != 0)
+ (void) dsl_destroy_head(fsname);
+ }
+-
+-#ifdef _KERNEL
+- if (error == 0)
+- zvol_create_minors(fsname);
+-#endif
+-
+ return (error);
+ }
+
+@@ -3305,11 +3292,6 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+
+ error = dsl_dataset_snapshot(snaps, props, outnvl);
+
+-#ifdef _KERNEL
+- if (error == 0)
+- zvol_create_minors(poolname);
+-#endif
+-
+ return (error);
+ }
+
+@@ -3435,7 +3417,6 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ (void) zfs_unmount_snap(nvpair_name(pair));
+- (void) zvol_remove_minor(nvpair_name(pair));
+ }
+
+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+@@ -3561,8 +3542,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+ else
+ err = dsl_destroy_head(zc->zc_name);
+- if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+- (void) zvol_remove_minor(zc->zc_name);
++
+ return (err);
+ }
+
+@@ -4128,11 +4108,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
+ }
+ #endif
+
+-#ifdef _KERNEL
+- if (error == 0)
+- zvol_create_minors(tofs);
+-#endif
+-
+ /*
+ * On error, restore the original props.
+ */
+@@ -6018,16 +5993,16 @@ _init(void)
+ return (error);
+ }
+
++ if ((error = -zvol_init()) != 0)
++ return (error);
++
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+
+- if ((error = -zvol_init()) != 0)
+- goto out1;
+-
+ zfs_ioctl_init();
+
+ if ((error = zfs_attach()) != 0)
+- goto out2;
++ goto out;
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+@@ -6043,11 +6018,10 @@ _init(void)
+
+ return (0);
+
+-out2:
+- (void) zvol_fini();
+-out1:
++out:
+ zfs_fini();
+ spa_fini();
++ (void) zvol_fini();
+ printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s"
+ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE,
+ ZFS_DEBUG_STR, error);
+@@ -6059,9 +6033,9 @@ static void __exit
+ _fini(void)
+ {
+ zfs_detach();
+- zvol_fini();
+ zfs_fini();
+ spa_fini();
++ zvol_fini();
+
+ tsd_destroy(&zfs_fsyncer_key);
+ tsd_destroy(&rrw_tsd_key);
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 8b0be3f..6e33106 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -41,12 +41,15 @@
+ #include <sys/dmu_traverse.h>
+ #include <sys/dsl_dataset.h>
+ #include <sys/dsl_prop.h>
++#include <sys/dsl_dir.h>
+ #include <sys/zap.h>
+ #include <sys/zfeature.h>
+ #include <sys/zil_impl.h>
++#include <sys/dmu_tx.h>
+ #include <sys/zio.h>
+ #include <sys/zfs_rlock.h>
+ #include <sys/zfs_znode.h>
++#include <sys/spa_impl.h>
+ #include <sys/zvol.h>
+ #include <linux/blkdev_compat.h>
+
+@@ -80,6 +83,23 @@ typedef struct zvol_state {
+ list_node_t zv_next; /* next zvol_state_t linkage */
+ } zvol_state_t;
+
++typedef enum {
++ ZVOL_ASYNC_CREATE_MINORS,
++ ZVOL_ASYNC_REMOVE_MINORS,
++ ZVOL_ASYNC_RENAME_MINORS,
++ ZVOL_ASYNC_SET_SNAPDEV,
++ ZVOL_ASYNC_MAX
++} zvol_async_op_t;
++
++typedef struct {
++ zvol_async_op_t op;
++ char pool[MAXNAMELEN];
++ char name1[MAXNAMELEN];
++ char name2[MAXNAMELEN];
++ zprop_source_t source;
++ uint64_t snapdev;
++} zvol_task_t;
++
+ #define ZVOL_RDONLY 0x1
+
+ /*
+@@ -953,7 +973,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+
+ /*
+ * If the caller is already holding the mutex do not take it
+- * again, this will happen as part of zvol_create_minor().
++ * again, this will happen as part of zvol_create_minor_impl().
+ * Once add_disk() is called the device is live and the kernel
+ * will attempt to open it to read the partition information.
+ */
+@@ -1273,31 +1293,13 @@ zvol_free(zvol_state_t *zv)
+ kmem_free(zv, sizeof (zvol_state_t));
+ }
+
++/*
++ * Create a block device minor node and setup the linkage between it
++ * and the specified volume. Once this function returns the block
++ * device is live and ready for use.
++ */
+ static int
+-__zvol_snapdev_hidden(const char *name)
+-{
+- uint64_t snapdev;
+- char *parent;
+- char *atp;
+- int error = 0;
+-
+- parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+- (void) strlcpy(parent, name, MAXPATHLEN);
+-
+- if ((atp = strrchr(parent, '@')) != NULL) {
+- *atp = '\0';
+- error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+- if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+- error = SET_ERROR(ENODEV);
+- }
+-
+- kmem_free(parent, MAXPATHLEN);
+-
+- return (SET_ERROR(error));
+-}
+-
+-static int
+-__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
++zvol_create_minor_impl(const char *name)
+ {
+ zvol_state_t *zv;
+ objset_t *os;
+@@ -1307,7 +1309,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+ unsigned minor = 0;
+ int error = 0;
+
+- ASSERT(MUTEX_HELD(&zvol_state_lock));
++ mutex_enter(&zvol_state_lock);
+
+ zv = zvol_find_by_name(name);
+ if (zv) {
+@@ -1315,12 +1317,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+ goto out;
+ }
+
+- if (ignore_snapdev == B_FALSE) {
+- error = __zvol_snapdev_hidden(name);
+- if (error)
+- goto out;
+- }
+-
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+@@ -1406,69 +1402,18 @@ out:
+ */
+ mutex_exit(&zvol_state_lock);
+ add_disk(zv->zv_disk);
+- mutex_enter(&zvol_state_lock);
++ } else {
++ mutex_exit(&zvol_state_lock);
+ }
+
+ return (SET_ERROR(error));
+ }
+
+ /*
+- * Create a block device minor node and setup the linkage between it
+- * and the specified volume. Once this function returns the block
+- * device is live and ready for use.
+- */
+-int
+-zvol_create_minor(const char *name)
+-{
+- int error;
+-
+- mutex_enter(&zvol_state_lock);
+- error = __zvol_create_minor(name, B_FALSE);
+- mutex_exit(&zvol_state_lock);
+-
+- return (SET_ERROR(error));
+-}
+-
+-static int
+-__zvol_remove_minor(const char *name)
+-{
+- zvol_state_t *zv;
+-
+- ASSERT(MUTEX_HELD(&zvol_state_lock));
+-
+- zv = zvol_find_by_name(name);
+- if (zv == NULL)
+- return (SET_ERROR(ENXIO));
+-
+- if (zv->zv_open_count > 0)
+- return (SET_ERROR(EBUSY));
+-
+- zvol_remove(zv);
+- zvol_free(zv);
+-
+- return (0);
+-}
+-
+-/*
+- * Remove a block device minor node for the specified volume.
+- */
+-int
+-zvol_remove_minor(const char *name)
+-{
+- int error;
+-
+- mutex_enter(&zvol_state_lock);
+- error = __zvol_remove_minor(name);
+- mutex_exit(&zvol_state_lock);
+-
+- return (SET_ERROR(error));
+-}
+-
+-/*
+ * Rename a block device minor mode for the specified volume.
+ */
+ static void
+-__zvol_rename_minor(zvol_state_t *zv, const char *newname)
++zvol_rename_minor(zvol_state_t *zv, const char *newname)
+ {
+ int readonly = get_disk_ro(zv->zv_disk);
+
+@@ -1488,30 +1433,120 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname)
+ set_disk_ro(zv->zv_disk, readonly);
+ }
+
++
++/*
++ * Mask errors to continue dmu_objset_find() traversal
++ */
++static int
++zvol_create_snap_minor_cb(const char *dsname, void *arg)
++{
++ const char *name = (const char *)arg;
++
++ /* skip the designated dataset */
++ if (name && strcmp(dsname, name) == 0)
++ return (0);
++
++ /* at this point, the dsname should name a snapshot */
++ if (strchr(dsname, '@') == 0) {
++ dprintf("zvol_create_snap_minor_cb(): "
++ "%s is not a shapshot name\n", dsname);
++ } else {
++ (void) zvol_create_minor_impl(dsname);
++ }
++
++ return (0);
++}
++
++/*
++ * Mask errors to continue dmu_objset_find() traversal
++ */
+ static int
+ zvol_create_minors_cb(const char *dsname, void *arg)
+ {
+- (void) zvol_create_minor(dsname);
++ uint64_t snapdev;
++ int error;
++
++ error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
++ if (error)
++ return (0);
++
++ /*
++ * Given the name and the 'snapdev' property, create device minor nodes
++ * with the linkages to zvols/snapshots as needed.
++ * If the name represents a zvol, create a minor node for the zvol, then
++ * check if its snapshots are 'visible', and if so, iterate over the
++ * snapshots and create device minor nodes for those.
++ */
++ if (strchr(dsname, '@') == 0) {
++ /* create minor for the 'dsname' explicitly */
++ error = zvol_create_minor_impl(dsname);
++ if ((error == 0 || error == EEXIST) &&
++ (snapdev == ZFS_SNAPDEV_VISIBLE)) {
++ fstrans_cookie_t cookie = spl_fstrans_mark();
++ /*
++ * traverse snapshots only, do not traverse children,
++ * and skip the 'dsname'
++ */
++ error = dmu_objset_find((char *)dsname,
++ zvol_create_snap_minor_cb, (void *)dsname,
++ DS_FIND_SNAPSHOTS);
++ spl_fstrans_unmark(cookie);
++ }
++ } else {
++ dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
++ dsname);
++ }
+
+ return (0);
+ }
+
+ /*
+- * Create minors for specified dataset including children and snapshots.
++ * Create minors for the specified dataset, including children and snapshots.
++ * Pay attention to the 'snapdev' property and iterate over the snapshots
++ * only if they are 'visible'. This approach allows one to assure that the
++ * snapshot metadata is read from disk only if it is needed.
++ *
++ * The name can represent a dataset to be recursively scanned for zvols and
++ * their snapshots, or a single zvol snapshot. If the name represents a
++ * dataset, the scan is performed in two nested stages:
++ * - scan the dataset for zvols, and
++ * - for each zvol, create a minor node, then check if the zvol's snapshots
++ * are 'visible', and only then iterate over the snapshots if needed
++ *
++ * If the name represents a snapshot, a check is perfromed if the snapshot is
++ * 'visible' (which also verifies that the parent is a zvol), and if so,
++ * a minor node for that snapshot is created.
+ */
+-int
+-zvol_create_minors(const char *name)
++static int
++zvol_create_minors_impl(const char *name)
+ {
+ int error = 0;
+ fstrans_cookie_t cookie;
++ char *atp, *parent;
+
+ if (zvol_inhibit_dev)
+ return (0);
+
+- cookie = spl_fstrans_mark();
+- error = dmu_objset_find((char *)name, zvol_create_minors_cb,
+- NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+- spl_fstrans_unmark(cookie);
++ parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
++ (void) strlcpy(parent, name, MAXPATHLEN);
++
++ if ((atp = strrchr(parent, '@')) != NULL) {
++ uint64_t snapdev;
++
++ *atp = '\0';
++ error = dsl_prop_get_integer(parent, "snapdev",
++ &snapdev, NULL);
++
++ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
++ error = zvol_create_minor_impl(name);
++ } else {
++ cookie = spl_fstrans_mark();
++ error = dmu_objset_find(parent, zvol_create_minors_cb,
++ NULL, DS_FIND_CHILDREN);
++ spl_fstrans_unmark(cookie);
++ }
++
++ kmem_free(parent, MAXPATHLEN);
+
+ return (SET_ERROR(error));
+ }
+@@ -1519,8 +1554,8 @@ zvol_create_minors(const char *name)
+ /*
+ * Remove minors for specified dataset including children and snapshots.
+ */
+-void
+-zvol_remove_minors(const char *name)
++static void
++zvol_remove_minors_impl(const char *name)
+ {
+ zvol_state_t *zv, *zv_next;
+ int namelen = ((name) ? strlen(name) : 0);
+@@ -1550,11 +1585,41 @@ zvol_remove_minors(const char *name)
+ mutex_exit(&zvol_state_lock);
+ }
+
++/* Remove minor for this specific snapshot only */
++static void
++zvol_remove_minor_impl(const char *name)
++{
++ zvol_state_t *zv, *zv_next;
++
++ if (zvol_inhibit_dev)
++ return;
++
++ if (strchr(name, '@') == NULL)
++ return;
++
++ mutex_enter(&zvol_state_lock);
++
++ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
++ zv_next = list_next(&zvol_state_list, zv);
++
++ if (strcmp(zv->zv_name, name) == 0) {
++ /* If in use, leave alone */
++ if (zv->zv_open_count > 0)
++ continue;
++ zvol_remove(zv);
++ zvol_free(zv);
++ break;
++ }
++ }
++
++ mutex_exit(&zvol_state_lock);
++}
++
+ /*
+ * Rename minors for specified dataset including children and snapshots.
+ */
+-void
+-zvol_rename_minors(const char *oldname, const char *newname)
++static void
++zvol_rename_minors_impl(const char *oldname, const char *newname)
+ {
+ zvol_state_t *zv, *zv_next;
+ int oldnamelen, newnamelen;
+@@ -1577,14 +1642,14 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ continue;
+
+ if (strcmp(zv->zv_name, oldname) == 0) {
+- __zvol_rename_minor(zv, newname);
++ zvol_rename_minor(zv, newname);
+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+ (zv->zv_name[oldnamelen] == '/' ||
+ zv->zv_name[oldnamelen] == '@')) {
+ snprintf(name, MAXNAMELEN, "%s%c%s", newname,
+ zv->zv_name[oldnamelen],
+ zv->zv_name + oldnamelen + 1);
+- __zvol_rename_minor(zv, name);
++ zvol_rename_minor(zv, name);
+ }
+ }
+
+@@ -1593,42 +1658,227 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ kmem_free(name, MAXNAMELEN);
+ }
+
++typedef struct zvol_snapdev_cb_arg {
++ uint64_t snapdev;
++} zvol_snapdev_cb_arg_t;
++
+ static int
+-snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+- uint64_t snapdev = *(uint64_t *) arg;
++zvol_set_snapdev_cb(const char *dsname, void *param) {
++ zvol_snapdev_cb_arg_t *arg = param;
+
+ if (strchr(dsname, '@') == NULL)
+ return (0);
+
+- switch (snapdev) {
++ switch (arg->snapdev) {
+ case ZFS_SNAPDEV_VISIBLE:
+- mutex_enter(&zvol_state_lock);
+- (void) __zvol_create_minor(dsname, B_TRUE);
+- mutex_exit(&zvol_state_lock);
++ (void) zvol_create_minor_impl(dsname);
+ break;
+ case ZFS_SNAPDEV_HIDDEN:
+- (void) zvol_remove_minor(dsname);
++ (void) zvol_remove_minor_impl(dsname);
+ break;
+ }
+
+ return (0);
+ }
+
++static void
++zvol_set_snapdev_impl(char *name, uint64_t snapdev)
++{
++ zvol_snapdev_cb_arg_t arg = {snapdev};
++ fstrans_cookie_t cookie = spl_fstrans_mark();
++ /*
++ * The zvol_set_snapdev_sync() sets snapdev appropriately
++ * in the dataset hierarchy. Here, we only scan snapshots.
++ */
++ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
++ spl_fstrans_unmark(cookie);
++}
++
++static zvol_task_t *
++zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
++ uint64_t snapdev)
++{
++ zvol_task_t *task;
++ char *delim;
++
++ /* Never allow tasks on hidden names. */
++ if (name1[0] == '$')
++ return (NULL);
++
++ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
++ task->op = op;
++ task->snapdev = snapdev;
++ delim = strchr(name1, '/');
++ strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
++
++ strlcpy(task->name1, name1, MAXNAMELEN);
++ if (name2 != NULL)
++ strlcpy(task->name2, name2, MAXNAMELEN);
++
++ return (task);
++}
++
++static void
++zvol_task_free(zvol_task_t *task)
++{
++ kmem_free(task, sizeof (zvol_task_t));
++}
++
++/*
++ * The worker thread function performed asynchronously.
++ */
++static void
++zvol_task_cb(void *param)
++{
++ zvol_task_t *task = (zvol_task_t *)param;
++
++ switch (task->op) {
++ case ZVOL_ASYNC_CREATE_MINORS:
++ (void) zvol_create_minors_impl(task->name1);
++ break;
++ case ZVOL_ASYNC_REMOVE_MINORS:
++ zvol_remove_minors_impl(task->name1);
++ break;
++ case ZVOL_ASYNC_RENAME_MINORS:
++ zvol_rename_minors_impl(task->name1, task->name2);
++ break;
++ case ZVOL_ASYNC_SET_SNAPDEV:
++ zvol_set_snapdev_impl(task->name1, task->snapdev);
++ break;
++ default:
++ VERIFY(0);
++ break;
++ }
++
++ zvol_task_free(task);
++}
++
++typedef struct zvol_set_snapdev_arg {
++ const char *zsda_name;
++ uint64_t zsda_value;
++ zprop_source_t zsda_source;
++ dmu_tx_t *zsda_tx;
++} zvol_set_snapdev_arg_t;
++
++/*
++ * Sanity check the dataset for safe use by the sync task. No additional
++ * conditions are imposed.
++ */
++static int
++zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
++{
++ zvol_set_snapdev_arg_t *zsda = arg;
++ dsl_pool_t *dp = dmu_tx_pool(tx);
++ dsl_dir_t *dd;
++ int error;
++
++ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
++ if (error != 0)
++ return (error);
++
++ dsl_dir_rele(dd, FTAG);
++
++ return (error);
++}
++
++static int
++zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
++{
++ zvol_set_snapdev_arg_t *zsda = arg;
++ char dsname[MAXNAMELEN];
++ zvol_task_t *task;
++
++ dsl_dataset_name(ds, dsname);
++ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
++ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
++ &zsda->zsda_value, zsda->zsda_tx);
++
++ task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
++ NULL, zsda->zsda_value);
++ if (task == NULL)
++ return (0);
++
++ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
++ task, TQ_SLEEP);
++ return (0);
++}
++
++/*
++ * Traverse all child snapshot datasets and apply snapdev appropriately.
++ */
++static void
++zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
++{
++ zvol_set_snapdev_arg_t *zsda = arg;
++ dsl_pool_t *dp = dmu_tx_pool(tx);
++ dsl_dir_t *dd;
++
++ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
++ zsda->zsda_tx = tx;
++
++ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
++ zsda, DS_FIND_CHILDREN);
++
++ dsl_dir_rele(dd, FTAG);
++}
++
+ int
+-zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
+- fstrans_cookie_t cookie;
++zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
++{
++ zvol_set_snapdev_arg_t zsda;
+
+- if (zvol_inhibit_dev)
+- /* caller should continue to modify snapdev property */
+- return (-1);
++ zsda.zsda_name = ddname;
++ zsda.zsda_source = source;
++ zsda.zsda_value = snapdev;
+
+- cookie = spl_fstrans_mark();
+- (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+- &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+- spl_fstrans_unmark(cookie);
++ return (dsl_sync_task(ddname, zvol_set_snapdev_check,
++ zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
++}
++
++void
++zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
++{
++ zvol_task_t *task;
++ taskqid_t id;
++
++ task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL);
++ if (task == NULL)
++ return;
++
++ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++ if ((async == B_FALSE) && (id != 0))
++ taskq_wait_id(spa->spa_zvol_taskq, id);
++}
++
++void
++zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
++{
++ zvol_task_t *task;
++ taskqid_t id;
++
++ task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
++ if (task == NULL)
++ return;
+
+- /* caller should continue to modify snapdev property */
+- return (-1);
++ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++ if ((async == B_FALSE) && (id != 0))
++ taskq_wait_id(spa->spa_zvol_taskq, id);
++}
++
++void
++zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
++ boolean_t async)
++{
++ zvol_task_t *task;
++ taskqid_t id;
++
++ task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
++ if (task == NULL)
++ return;
++
++ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++ if ((async == B_FALSE) && (id != 0))
++ taskq_wait_id(spa->spa_zvol_taskq, id);
+ }
+
+ int
+@@ -1662,11 +1911,13 @@ out:
+ void
+ zvol_fini(void)
+ {
+- zvol_remove_minors(NULL);
++ zvol_remove_minors_impl(NULL);
++
+ blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+- mutex_destroy(&zvol_state_lock);
++
+ list_destroy(&zvol_state_list);
++ mutex_destroy(&zvol_state_lock);
+ }
+
+ module_param(zvol_inhibit_dev, uint, 0644);
+diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh
+index 45ccf62..1908dc1 100755
+--- a/scripts/zconfig.sh
++++ b/scripts/zconfig.sh
+@@ -217,15 +217,26 @@ test_3() {
+ zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \
+ ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 11
+
++ # Toggle the snapdev and observe snapshot device links toggled
++ ${ZFS} set snapdev=hidden ${FULL_ZVOL_NAME} || fail 12
++
++ zconfig_zvol_device_stat 7 ${POOL_NAME} ${FULL_ZVOL_NAME} \
++ "invalid" ${FULL_CLONE_NAME} || fail 13
++
++ ${ZFS} set snapdev=visible ${FULL_ZVOL_NAME} || fail 14
++
++ zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \
++ ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 15
++
+ # Destroy the pool and consequently the devices
+- ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 12
++ ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 16
+
+ # verify the devices were removed
+ zconfig_zvol_device_stat 0 ${POOL_NAME} ${FULL_ZVOL_NAME} \
+- ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 13
++ ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 17
+
+- ${ZFS_SH} -u || fail 14
+- rm -f ${TMP_CACHE} || fail 15
++ ${ZFS_SH} -u || fail 18
++ rm -f ${TMP_CACHE} || fail 19
+
+ pass
+ }
+--
+2.1.4
+
diff --git a/zfs-patches/series b/zfs-patches/series
index 735ea2d..fe93901 100644
--- a/zfs-patches/series
+++ b/zfs-patches/series
@@ -1,2 +1,5 @@
fix-control.patch
fix-dh-installinit.patch
+0001-Fix-lock-order-inversion-with-zvol_open.patch
+0002-Make-zvol-minor-functionality-more-robust.patch
+0003-Add-support-for-asynchronous-zvol-minor-operations.patch
--
2.1.4
More information about the pve-devel
mailing list