[pve-devel] [RFC PATCH zfsonlinux] Add backported rename/locking fixes

Fabian Grünbichler f.gruenbichler at proxmox.com
Mon Mar 21 16:48:44 CET 2016


backport three upstream commits related to zvol minor
operations, concurrency and locking. These should hopefully
fix the deadlock occuring on parallel zvol_open and rename
operations.
---
Tested with various parallel create, rename, destroy and
snapshot & rename commands in a loop. systemd-udevd itself
seems to also have a problem when rapidly creating and
destroying device nodes, but with these patches applied
the zfs side seems fixed (no more hanging zpools!).

Waiting for the device node to appear (/dev/zvol/<pool>/<vol>)
seems to work better than using udevadm settle/trigger.

Note: more sets of eyes and more testing will be required
before actually applying this!

 ...1-Fix-lock-order-inversion-with-zvol_open.patch |  111 ++
 ...Make-zvol-minor-functionality-more-robust.patch |  278 +++++
 ...rt-for-asynchronous-zvol-minor-operations.patch | 1293 ++++++++++++++++++++
 zfs-patches/series                                 |    3 +
 4 files changed, 1685 insertions(+)
 create mode 100644 zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
 create mode 100644 zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
 create mode 100644 zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch

diff --git a/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch b/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
new file mode 100644
index 0000000..5ae1f3a
--- /dev/null
+++ b/zfs-patches/0001-Fix-lock-order-inversion-with-zvol_open.patch
@@ -0,0 +1,111 @@
+From 1ee159f423b5eb3c4646b0ba2dd0fb359503ba90 Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Wed, 23 Sep 2015 12:34:51 -0400
+Subject: [PATCH] Fix lock order inversion with zvol_open()
+
+zfsonlinux issue #3681 - lock order inversion between zvol_open() and
+dsl_pool_sync()...zvol_rename_minors()
+
+Remove trylock of spa_namespace_lock as it is no longer needed when
+zvol minor operations are performed in a separate context with no
+prior locking state; the spa_namespace_lock is no longer held
+when bdev->bd_mutex or zfs_state_lock might be taken in the code
+paths originating from the zvol minor operation callbacks.
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #3681
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ module/zfs/zvol.c | 57 ++++++++++++++++---------------------------------------
+ 1 file changed, 16 insertions(+), 41 deletions(-)
+
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 613b47e..6996242 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -874,54 +874,27 @@ zvol_first_open(zvol_state_t *zv)
+ {
+ 	objset_t *os;
+ 	uint64_t volsize;
+-	int locked = 0;
+ 	int error;
+ 	uint64_t ro;
+ 
+-	/*
+-	 * In all other cases the spa_namespace_lock is taken before the
+-	 * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
+-	 * function calls fops->open() with the bdev->bd_mutex lock held.
+-	 *
+-	 * To avoid a potential lock inversion deadlock we preemptively
+-	 * try to take the spa_namespace_lock().  Normally it will not
+-	 * be contended and this is safe because spa_open_common() handles
+-	 * the case where the caller already holds the spa_namespace_lock.
+-	 *
+-	 * When it is contended we risk a lock inversion if we were to
+-	 * block waiting for the lock.  Luckily, the __blkdev_get()
+-	 * function allows us to return -ERESTARTSYS which will result in
+-	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+-	 * called again.  This process can be repeated safely until both
+-	 * locks are acquired.
+-	 */
+-	if (!mutex_owned(&spa_namespace_lock)) {
+-		locked = mutex_tryenter(&spa_namespace_lock);
+-		if (!locked)
+-			return (-SET_ERROR(ERESTARTSYS));
+-	}
+-
+-	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+-	if (error)
+-		goto out_mutex;
+-
+ 	/* lie and say we're read-only */
+ 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
+ 	if (error)
+-		goto out_mutex;
++		return (SET_ERROR(-error));
++
++	zv->zv_objset = os;
++
++	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
++	if (error)
++		goto out_owned;
+ 
+ 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+-	if (error) {
+-		dmu_objset_disown(os, zvol_tag);
+-		goto out_mutex;
+-	}
++	if (error)
++		goto out_owned;
+ 
+-	zv->zv_objset = os;
+ 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+-	if (error) {
+-		dmu_objset_disown(os, zvol_tag);
+-		goto out_mutex;
+-	}
++	if (error)
++		goto out_owned;
+ 
+ 	set_capacity(zv->zv_disk, volsize >> 9);
+ 	zv->zv_volsize = volsize;
+@@ -936,9 +909,11 @@ zvol_first_open(zvol_state_t *zv)
+ 		zv->zv_flags &= ~ZVOL_RDONLY;
+ 	}
+ 
+-out_mutex:
+-	if (locked)
+-		mutex_exit(&spa_namespace_lock);
++out_owned:
++	if (error) {
++		dmu_objset_disown(os, zvol_tag);
++		zv->zv_objset = NULL;
++	}
+ 
+ 	return (SET_ERROR(-error));
+ }
+-- 
+2.1.4
+
diff --git a/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch b/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
new file mode 100644
index 0000000..1861b73
--- /dev/null
+++ b/zfs-patches/0002-Make-zvol-minor-functionality-more-robust.patch
@@ -0,0 +1,278 @@
+From 5428dc51fb55145fbac1c142402dafc11d1e7d28 Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Tue, 16 Feb 2016 14:52:55 -0500
+Subject: [PATCH] Make zvol minor functionality more robust
+
+Close the race window in zvol_open() to prevent removal of
+zvol_state in the 'first open' code path. Move the call to
+check_disk_change() under zvol_state_lock to make sure the
+zvol_media_changed() and zvol_revalidate_disk() called by
+check_disk_change() are invoked with positive zv_open_count.
+
+Skip opened zvols when removing minors and set private_data
+to NULL for zvols that are not in use whose minors are being
+removed, to indicate to zvol_open() that the state is gone.
+Skip opened zvols when renaming minors to avoid modifying
+zv_name that might be in use, e.g. in zvol_ioctl().
+
+Drop zvol_state_lock before calling add_disk() when creating
+minors to avoid deadlocks with zvol_open().
+
+Wrap dmu_objset_find() with spl_fstran_mark()/unmark().
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Signed-off-by: Richard Yao <ryao at gentoo.org>
+Closes #4344
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ module/zfs/zvol.c | 94 +++++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 77 insertions(+), 17 deletions(-)
+
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 6996242..8b0be3f 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -33,6 +33,8 @@
+  *
+  * Volumes are persistent through reboot and module load.  No user command
+  * needs to be run before opening and using a device.
++ *
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <sys/dbuf.h>
+@@ -599,6 +601,8 @@ zvol_write(struct bio *bio)
+ 	dmu_tx_t *tx;
+ 	rl_t *rl;
+ 
++	ASSERT(zv && zv->zv_open_count > 0);
++
+ 	if (bio->bi_rw & VDEV_REQ_FLUSH)
+ 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ 
+@@ -647,6 +651,8 @@ zvol_discard(struct bio *bio)
+ 	int error;
+ 	rl_t *rl;
+ 
++	ASSERT(zv && zv->zv_open_count > 0);
++
+ 	if (end > zv->zv_volsize)
+ 		return (SET_ERROR(EIO));
+ 
+@@ -693,7 +699,7 @@ zvol_read(struct bio *bio)
+ 
+ 	if (len == 0)
+ 		return (0);
+-
++	ASSERT(zv && zv->zv_open_count > 0);
+ 
+ 	rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
+ 
+@@ -942,7 +948,7 @@ zvol_last_close(zvol_state_t *zv)
+ static int
+ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+-	zvol_state_t *zv = bdev->bd_disk->private_data;
++	zvol_state_t *zv;
+ 	int error = 0, drop_mutex = 0;
+ 
+ 	/*
+@@ -956,7 +962,17 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ 		drop_mutex = 1;
+ 	}
+ 
+-	ASSERT3P(zv, !=, NULL);
++	/*
++	 * Obtain a copy of private_data under the lock to make sure
++	 * that either the result of zvol_freeg() setting
++	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
++	 * is not called on this zv because of the positive zv_open_count.
++	 */
++	zv = bdev->bd_disk->private_data;
++	if (zv == NULL) {
++		error = -ENXIO;
++		goto out_mutex;
++	}
+ 
+ 	if (zv->zv_open_count == 0) {
+ 		error = zvol_first_open(zv);
+@@ -971,6 +987,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ 
+ 	zv->zv_open_count++;
+ 
++	check_disk_change(bdev);
++
+ out_open_count:
+ 	if (zv->zv_open_count == 0)
+ 		zvol_last_close(zv);
+@@ -979,8 +997,6 @@ out_mutex:
+ 	if (drop_mutex)
+ 		mutex_exit(&zvol_state_lock);
+ 
+-	check_disk_change(bdev);
+-
+ 	return (SET_ERROR(error));
+ }
+ 
+@@ -994,16 +1010,16 @@ zvol_release(struct gendisk *disk, fmode_t mode)
+ 	zvol_state_t *zv = disk->private_data;
+ 	int drop_mutex = 0;
+ 
++	ASSERT(zv && zv->zv_open_count > 0);
++
+ 	if (!mutex_owned(&zvol_state_lock)) {
+ 		mutex_enter(&zvol_state_lock);
+ 		drop_mutex = 1;
+ 	}
+ 
+-	if (zv->zv_open_count > 0) {
+-		zv->zv_open_count--;
+-		if (zv->zv_open_count == 0)
+-			zvol_last_close(zv);
+-	}
++	zv->zv_open_count--;
++	if (zv->zv_open_count == 0)
++		zvol_last_close(zv);
+ 
+ 	if (drop_mutex)
+ 		mutex_exit(&zvol_state_lock);
+@@ -1020,8 +1036,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
+ 	zvol_state_t *zv = bdev->bd_disk->private_data;
+ 	int error = 0;
+ 
+-	if (zv == NULL)
+-		return (SET_ERROR(-ENXIO));
++	ASSERT(zv && zv->zv_open_count > 0);
+ 
+ 	switch (cmd) {
+ 	case BLKFLSBUF:
+@@ -1055,6 +1070,8 @@ static int zvol_media_changed(struct gendisk *disk)
+ {
+ 	zvol_state_t *zv = disk->private_data;
+ 
++	ASSERT(zv && zv->zv_open_count > 0);
++
+ 	return (zv->zv_changed);
+ }
+ 
+@@ -1062,6 +1079,8 @@ static int zvol_revalidate_disk(struct gendisk *disk)
+ {
+ 	zvol_state_t *zv = disk->private_data;
+ 
++	ASSERT(zv && zv->zv_open_count > 0);
++
+ 	zv->zv_changed = 0;
+ 	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+ 
+@@ -1078,7 +1097,11 @@ static int
+ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+ {
+ 	zvol_state_t *zv = bdev->bd_disk->private_data;
+-	sector_t sectors = get_capacity(zv->zv_disk);
++	sector_t sectors;
++
++	ASSERT(zv && zv->zv_open_count > 0);
++
++	sectors = get_capacity(zv->zv_disk);
+ 
+ 	if (sectors > 2048) {
+ 		geo->heads = 16;
+@@ -1235,9 +1258,14 @@ out_kmem:
+ static void
+ zvol_free(zvol_state_t *zv)
+ {
++	ASSERT(MUTEX_HELD(&zvol_state_lock));
++	ASSERT(zv->zv_open_count == 0);
++
+ 	avl_destroy(&zv->zv_znode.z_range_avl);
+ 	mutex_destroy(&zv->zv_znode.z_range_lock);
+ 
++	zv->zv_disk->private_data = NULL;
++
+ 	del_gendisk(zv->zv_disk);
+ 	blk_cleanup_queue(zv->zv_queue);
+ 	put_disk(zv->zv_disk);
+@@ -1370,7 +1398,15 @@ out:
+ 
+ 	if (error == 0) {
+ 		zvol_insert(zv);
++		/*
++		 * Drop the lock to prevent deadlock with sys_open() ->
++		 * zvol_open(), which first takes bd_disk->bd_mutex and then
++		 * takes zvol_state_lock, whereas this code path first takes
++		 * zvol_state_lock, and then takes bd_disk->bd_mutex.
++		 */
++		mutex_exit(&zvol_state_lock);
+ 		add_disk(zv->zv_disk);
++		mutex_enter(&zvol_state_lock);
+ 	}
+ 
+ 	return (SET_ERROR(error));
+@@ -1467,10 +1503,15 @@ int
+ zvol_create_minors(const char *name)
+ {
+ 	int error = 0;
++	fstrans_cookie_t cookie;
+ 
+-	if (!zvol_inhibit_dev)
+-		error = dmu_objset_find((char *)name, zvol_create_minors_cb,
+-		    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
++	if (zvol_inhibit_dev)
++		return (0);
++
++	cookie = spl_fstrans_mark();
++	error = dmu_objset_find((char *)name, zvol_create_minors_cb,
++	    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
++	spl_fstrans_unmark(cookie);
+ 
+ 	return (SET_ERROR(error));
+ }
+@@ -1494,7 +1535,13 @@ zvol_remove_minors(const char *name)
+ 
+ 		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
+ 		    (strncmp(zv->zv_name, name, namelen) == 0 &&
+-		    zv->zv_name[namelen] == '/')) {
++		    (zv->zv_name[namelen] == '/' ||
++		    zv->zv_name[namelen] == '@'))) {
++
++			/* If in use, leave alone */
++			if (zv->zv_open_count > 0)
++				continue;
++
+ 			zvol_remove(zv);
+ 			zvol_free(zv);
+ 		}
+@@ -1525,6 +1572,10 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ 		zv_next = list_next(&zvol_state_list, zv);
+ 
++		/* If in use, leave alone */
++		if (zv->zv_open_count > 0)
++			continue;
++
+ 		if (strcmp(zv->zv_name, oldname) == 0) {
+ 			__zvol_rename_minor(zv, newname);
+ 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+@@ -1565,8 +1616,17 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+ 
+ int
+ zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
++	fstrans_cookie_t cookie;
++
++	if (zvol_inhibit_dev)
++		/* caller should continue to modify snapdev property */
++		return (-1);
++
++	cookie = spl_fstrans_mark();
+ 	(void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+ 		&snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
++	spl_fstrans_unmark(cookie);
++
+ 	/* caller should continue to modify snapdev property */
+ 	return (-1);
+ }
+-- 
+2.1.4
+
diff --git a/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch b/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch
new file mode 100644
index 0000000..be97aab
--- /dev/null
+++ b/zfs-patches/0003-Add-support-for-asynchronous-zvol-minor-operations.patch
@@ -0,0 +1,1293 @@
+From a0bd735adb1b1eb81fef10b4db102ee051c4d4ff Mon Sep 17 00:00:00 2001
+From: Boris Protopopov <boris.protopopov at actifio.com>
+Date: Sat, 22 Mar 2014 05:07:14 -0400
+Subject: [PATCH] Add support for asynchronous zvol minor operations
+
+zfsonlinux issue #2217 - zvol minor operations: check snapdev
+property before traversing snapshots of a dataset
+
+zfsonlinux issue #3681 - lock order inversion between zvol_open()
+and dsl_pool_sync()...zvol_rename_minors()
+
+Create a per-pool zvol taskq for asynchronous zvol tasks.
+There are a few key design decisions to be aware of.
+
+* Each taskq must be single threaded to ensure tasks are always
+  processed in the order in which they were dispatched.
+
+* There is a taskq per-pool in order to keep the pools independent.
+  This way if one pool is suspended it will not impact another.
+
+* The preferred location to dispatch a zvol minor task is a sync
+  task.  In this context there is easy access to the spa_t and
+  minimal error handling is required because the sync task must
+  succeed.
+
+Support for asynchronous zvol minor operations address issue #3681.
+
+Signed-off-by: Boris Protopopov <boris.protopopov at actifio.com>
+Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>
+Closes #2217
+Closes #3678
+Closes #3681
+
+Backported-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ include/sys/spa_impl.h   |   2 +
+ include/sys/zvol.h       |  15 +-
+ lib/libzpool/kernel.c    |  22 +++
+ module/zfs/dmu_objset.c  |   4 +
+ module/zfs/dmu_send.c    |   3 +
+ module/zfs/dsl_dataset.c |  36 +---
+ module/zfs/dsl_destroy.c |   8 +-
+ module/zfs/dsl_dir.c     |   6 +-
+ module/zfs/spa.c         |  37 +++-
+ module/zfs/zfs_ioctl.c   |  50 ++---
+ module/zfs/zvol.c        | 493 +++++++++++++++++++++++++++++++++++------------
+ scripts/zconfig.sh       |  19 +-
+ 12 files changed, 481 insertions(+), 214 deletions(-)
+
+diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
+index 0b49c71..5176eb8 100644
+--- a/include/sys/spa_impl.h
++++ b/include/sys/spa_impl.h
+@@ -23,6 +23,7 @@
+  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #ifndef _SYS_SPA_IMPL_H
+@@ -252,6 +253,7 @@ struct spa {
+ 	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
+ 	uint64_t	spa_errata;		/* errata issues detected */
+ 	spa_stats_t	spa_stats;		/* assorted spa statistics */
++	taskq_t		*spa_zvol_taskq;	/* Taskq for minor managment */
+ 
+ 	/*
+ 	 * spa_refcount & spa_config_lock must be the last elements
+diff --git a/include/sys/zvol.h b/include/sys/zvol.h
+index 898e235..c3e386f 100644
+--- a/include/sys/zvol.h
++++ b/include/sys/zvol.h
+@@ -21,6 +21,7 @@
+ 
+ /*
+  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #ifndef	_SYS_ZVOL_H
+@@ -31,24 +32,22 @@
+ #define	ZVOL_OBJ		1ULL
+ #define	ZVOL_ZAP_OBJ		2ULL
+ 
+-#ifdef _KERNEL
++extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async);
++extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async);
++extern void zvol_rename_minors(spa_t *spa, const char *oldname,
++    const char *newname, boolean_t async);
+ 
++#ifdef _KERNEL
+ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+ extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize);
+ extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+ extern boolean_t zvol_is_zvol(const char *);
+ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+-extern int zvol_create_minor(const char *name);
+-extern int zvol_create_minors(const char *name);
+-extern int zvol_remove_minor(const char *name);
+-extern void zvol_remove_minors(const char *name);
+-extern void zvol_rename_minors(const char *oldname, const char *newname);
+ extern int zvol_set_volsize(const char *, uint64_t);
+ extern int zvol_set_volblocksize(const char *, uint64_t);
+-extern int zvol_set_snapdev(const char *, uint64_t);
++extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t);
+ 
+ extern int zvol_init(void);
+ extern void zvol_fini(void);
+-
+ #endif /* _KERNEL */
+ #endif /* _SYS_ZVOL_H */
+diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
+index a451026..f1ce8b3 100644
+--- a/lib/libzpool/kernel.c
++++ b/lib/libzpool/kernel.c
+@@ -20,6 +20,7 @@
+  */
+ /*
+  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <assert.h>
+@@ -1322,3 +1323,24 @@ spl_fstrans_check(void)
+ {
+ 	return (0);
+ }
++
++void
++zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_remove_minor(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
++{
++}
++
++void
++zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
++    boolean_t async)
++{
++}
+diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
+index 779b3bb..f2d492e 100644
+--- a/module/zfs/dmu_objset.c
++++ b/module/zfs/dmu_objset.c
+@@ -26,6 +26,7 @@
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+  * Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved.
+  * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ /* Portions Copyright 2010 Robert Milkowski */
+@@ -900,6 +901,8 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
+ 	}
+ 
+ 	spa_history_log_internal_ds(ds, "create", tx, "");
++	zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE);
++
+ 	dsl_dataset_rele(ds, FTAG);
+ 	dsl_dir_rele(pdd, FTAG);
+ }
+@@ -993,6 +996,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
+ 	dsl_dataset_name(origin, namebuf);
+ 	spa_history_log_internal_ds(ds, "clone", tx,
+ 	    "origin=%s (%llu)", namebuf, origin->ds_object);
++	zvol_create_minors(dp->dp_spa, doca->doca_clone, B_TRUE);
+ 	dsl_dataset_rele(ds, FTAG);
+ 	dsl_dataset_rele(origin, FTAG);
+ 	dsl_dir_rele(pdd, FTAG);
+diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
+index b2d844e..6a349c6 100644
+--- a/module/zfs/dmu_send.c
++++ b/module/zfs/dmu_send.c
+@@ -24,6 +24,7 @@
+  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <sys/dmu.h>
+@@ -53,6 +54,7 @@
+ #include <sys/blkptr.h>
+ #include <sys/dsl_bookmark.h>
+ #include <sys/zfeature.h>
++#include <sys/zvol.h>
+ 
+ /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+ int zfs_send_corrupt_data = B_FALSE;
+@@ -2162,6 +2164,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+ 		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ 	}
+ 	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
++	zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
+ 	/*
+ 	 * Release the hold from dmu_recv_begin.  This must be done before
+ 	 * we return to open context, so that when we free the dataset's dnode,
+diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
+index 2168f28..a063885 100644
+--- a/module/zfs/dsl_dataset.c
++++ b/module/zfs/dsl_dataset.c
+@@ -24,6 +24,7 @@
+  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+  * Copyright (c) 2014 RackTop Systems.
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <sys/dmu_objset.h>
+@@ -1376,6 +1377,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+ 			dsl_props_set_sync_impl(ds->ds_prev,
+ 			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+ 		}
++		zvol_create_minors(dp->dp_spa, nvpair_name(pair), B_TRUE);
+ 		dsl_dataset_rele(ds, FTAG);
+ 	}
+ }
+@@ -1450,16 +1452,6 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+ 		fnvlist_free(suspended);
+ 	}
+ 
+-#ifdef _KERNEL
+-	if (error == 0) {
+-		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+-		    pair = nvlist_next_nvpair(snaps, pair)) {
+-			char *snapname = nvpair_name(pair);
+-			zvol_create_minors(snapname);
+-		}
+-	}
+-#endif
+-
+ 	return (error);
+ }
+ 
+@@ -1870,6 +1862,8 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+ 	VERIFY0(zap_add(dp->dp_meta_objset,
+ 	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+ 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
++	zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
++	    ddrsa->ddrsa_newsnapname, B_TRUE);
+ 
+ 	dsl_dataset_rele(ds, FTAG);
+ 	return (0);
+@@ -1898,11 +1892,6 @@ int
+ dsl_dataset_rename_snapshot(const char *fsname,
+     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+ {
+-#ifdef _KERNEL
+-	char *oldname, *newname;
+-#endif
+-	int error;
+-
+ 	dsl_dataset_rename_snapshot_arg_t ddrsa;
+ 
+ 	ddrsa.ddrsa_fsname = fsname;
+@@ -1910,22 +1899,9 @@ dsl_dataset_rename_snapshot(const char *fsname,
+ 	ddrsa.ddrsa_newsnapname = newsnapname;
+ 	ddrsa.ddrsa_recursive = recursive;
+ 
+-	error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
++	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+ 	    dsl_dataset_rename_snapshot_sync, &ddrsa,
+-	    1, ZFS_SPACE_CHECK_RESERVED);
+-
+-	if (error)
+-	    return (SET_ERROR(error));
+-
+-#ifdef _KERNEL
+-	oldname = kmem_asprintf("%s@%s", fsname, oldsnapname);
+-	newname = kmem_asprintf("%s@%s", fsname, newsnapname);
+-	zvol_rename_minors(oldname, newname);
+-	strfree(newname);
+-	strfree(oldname);
+-#endif
+-
+-	return (0);
++	    1, ZFS_SPACE_CHECK_RESERVED));
+ }
+ 
+ /*
+diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
+index 0e2238f..34d076e 100644
+--- a/module/zfs/dsl_destroy.c
++++ b/module/zfs/dsl_destroy.c
+@@ -23,6 +23,7 @@
+  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <sys/zfs_context.h>
+@@ -40,6 +41,7 @@
+ #include <sys/zfs_ioctl.h>
+ #include <sys/dsl_deleg.h>
+ #include <sys/dmu_impl.h>
++#include <sys/zvol.h>
+ 
+ typedef struct dmu_snapshots_destroy_arg {
+ 	nvlist_t *dsda_snaps;
+@@ -243,9 +245,6 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+ void
+ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+ {
+-#ifdef ZFS_DEBUG
+-	int err;
+-#endif
+ 	int after_branch_point = FALSE;
+ 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ 	objset_t *mos = dp->dp_meta_objset;
+@@ -438,6 +437,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+ #ifdef ZFS_DEBUG
+ 	{
+ 		uint64_t val;
++		int err;
+ 
+ 		err = dsl_dataset_snap_lookup(ds_head,
+ 		    ds->ds_snapname, &val);
+@@ -487,6 +487,7 @@ dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+ 		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+ 
+ 		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
++		zvol_remove_minors(dp->dp_spa, nvpair_name(pair), B_TRUE);
+ 		dsl_dataset_rele(ds, FTAG);
+ 	}
+ }
+@@ -881,6 +882,7 @@ dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+ 
+ 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+ 	dsl_destroy_head_sync_impl(ds, tx);
++	zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
+ 	dsl_dataset_rele(ds, FTAG);
+ }
+ 
+diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
+index ba6c244..762e2e5 100644
+--- a/module/zfs/dsl_dir.c
++++ b/module/zfs/dsl_dir.c
+@@ -24,6 +24,7 @@
+  * Copyright (c) 2013 Martin Matuska. All rights reserved.
+  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ #include <sys/dmu.h>
+@@ -1913,9 +1914,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
+ 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+ 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
+ 
+-#ifdef _KERNEL
+-	zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
+-#endif
++	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
++	    ddra->ddra_newname, B_TRUE);
+ 
+ 	dsl_prop_notify_all(dd);
+ 
+diff --git a/module/zfs/spa.c b/module/zfs/spa.c
+index b4831a7..4468cdf 100644
+--- a/module/zfs/spa.c
++++ b/module/zfs/spa.c
+@@ -24,6 +24,7 @@
+  * Copyright (c) 2013 by Delphix. All rights reserved.
+  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ /*
+@@ -1131,6 +1132,24 @@ spa_activate(spa_t *spa, int mode)
+ 	avl_create(&spa->spa_errlist_last,
+ 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
+ 	    offsetof(spa_error_entry_t, se_avl));
++
++	/*
++	 * This taskq is used to perform zvol-minor-related tasks
++	 * asynchronously. This has several advantages, including easy
++	 * resolution of various deadlocks (zfsonlinux bug #3681).
++	 *
++	 * The taskq must be single threaded to ensure tasks are always
++	 * processed in the order in which they were dispatched.
++	 *
++	 * A taskq per pool allows one to keep the pools independent.
++	 * This way if one pool is suspended, it will not impact another.
++	 *
++	 * The preferred location to dispatch a zvol minor task is a sync
++	 * task. In this context, there is easy access to the spa_t and minimal
++	 * error handling is required because the sync task must succeed.
++	 */
++	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
++	    1, INT_MAX, 0);
+ }
+ 
+ /*
+@@ -1149,6 +1168,11 @@ spa_deactivate(spa_t *spa)
+ 
+ 	spa_evicting_os_wait(spa);
+ 
++	if (spa->spa_zvol_taskq) {
++		taskq_destroy(spa->spa_zvol_taskq);
++		spa->spa_zvol_taskq = NULL;
++	}
++
+ 	txg_list_destroy(&spa->spa_vdev_txg_list);
+ 
+ 	list_destroy(&spa->spa_config_dirty_list);
+@@ -3083,10 +3107,8 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ 		mutex_exit(&spa_namespace_lock);
+ 	}
+ 
+-#ifdef _KERNEL
+ 	if (firstopen)
+-		zvol_create_minors(spa->spa_name);
+-#endif
++		zvol_create_minors(spa, spa_name(spa), B_TRUE);
+ 
+ 	*spapp = spa;
+ 
+@@ -4206,10 +4228,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+ 
+ 	mutex_exit(&spa_namespace_lock);
+ 	spa_history_log_version(spa, "import");
+-
+-#ifdef _KERNEL
+-	zvol_create_minors(pool);
+-#endif
++	zvol_create_minors(spa, pool, B_TRUE);
+ 
+ 	return (0);
+ }
+@@ -4344,6 +4363,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
+ 	spa_open_ref(spa, FTAG);
+ 	mutex_exit(&spa_namespace_lock);
+ 	spa_async_suspend(spa);
++	if (spa->spa_zvol_taskq) {
++		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
++		taskq_wait(spa->spa_zvol_taskq);
++	}
+ 	mutex_enter(&spa_namespace_lock);
+ 	spa_close(spa, FTAG);
+ 
+diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
+index d026e92..12d2750 100644
+--- a/module/zfs/zfs_ioctl.c
++++ b/module/zfs/zfs_ioctl.c
+@@ -30,6 +30,7 @@
+  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
++ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+  */
+ 
+ /*
+@@ -1500,8 +1501,7 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+ 	int error;
+ 	zfs_log_history(zc);
+ 	error = spa_destroy(zc->zc_name);
+-	if (error == 0)
+-		zvol_remove_minors(zc->zc_name);
++
+ 	return (error);
+ }
+ 
+@@ -1553,8 +1553,7 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
+ 
+ 	zfs_log_history(zc);
+ 	error = spa_export(zc->zc_name, NULL, force, hardforce);
+-	if (error == 0)
+-		zvol_remove_minors(zc->zc_name);
++
+ 	return (error);
+ }
+ 
+@@ -2395,7 +2394,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ 		err = zvol_set_volsize(dsname, intval);
+ 		break;
+ 	case ZFS_PROP_SNAPDEV:
+-		err = zvol_set_snapdev(dsname, intval);
++		err = zvol_set_snapdev(dsname, source, intval);
+ 		break;
+ 	case ZFS_PROP_VERSION:
+ 	{
+@@ -3189,12 +3188,6 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+ 		if (error != 0)
+ 			(void) dsl_destroy_head(fsname);
+ 	}
+-
+-#ifdef _KERNEL
+-	if (error == 0 && type == DMU_OST_ZVOL)
+-		zvol_create_minors(fsname);
+-#endif
+-
+ 	return (error);
+ }
+ 
+@@ -3237,12 +3230,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+ 		if (error != 0)
+ 			(void) dsl_destroy_head(fsname);
+ 	}
+-
+-#ifdef _KERNEL
+-	if (error == 0)
+-		zvol_create_minors(fsname);
+-#endif
+-
+ 	return (error);
+ }
+ 
+@@ -3305,11 +3292,6 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+ 
+ 	error = dsl_dataset_snapshot(snaps, props, outnvl);
+ 
+-#ifdef _KERNEL
+-	if (error == 0)
+-		zvol_create_minors(poolname);
+-#endif
+-
+ 	return (error);
+ }
+ 
+@@ -3435,7 +3417,6 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+ 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ 	    pair = nvlist_next_nvpair(snaps, pair)) {
+ 		(void) zfs_unmount_snap(nvpair_name(pair));
+-		(void) zvol_remove_minor(nvpair_name(pair));
+ 	}
+ 
+ 	return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+@@ -3561,8 +3542,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
+ 		err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+ 	else
+ 		err = dsl_destroy_head(zc->zc_name);
+-	if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+-		(void) zvol_remove_minor(zc->zc_name);
++
+ 	return (err);
+ }
+ 
+@@ -4128,11 +4108,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
+ 	}
+ #endif
+ 
+-#ifdef _KERNEL
+-	if (error == 0)
+-		zvol_create_minors(tofs);
+-#endif
+-
+ 	/*
+ 	 * On error, restore the original props.
+ 	 */
+@@ -6018,16 +5993,16 @@ _init(void)
+ 		return (error);
+ 	}
+ 
++	if ((error = -zvol_init()) != 0)
++		return (error);
++
+ 	spa_init(FREAD | FWRITE);
+ 	zfs_init();
+ 
+-	if ((error = -zvol_init()) != 0)
+-		goto out1;
+-
+ 	zfs_ioctl_init();
+ 
+ 	if ((error = zfs_attach()) != 0)
+-		goto out2;
++		goto out;
+ 
+ 	tsd_create(&zfs_fsyncer_key, NULL);
+ 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+@@ -6043,11 +6018,10 @@ _init(void)
+ 
+ 	return (0);
+ 
+-out2:
+-	(void) zvol_fini();
+-out1:
++out:
+ 	zfs_fini();
+ 	spa_fini();
++	(void) zvol_fini();
+ 	printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s"
+ 	    ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE,
+ 	    ZFS_DEBUG_STR, error);
+@@ -6059,9 +6033,9 @@ static void __exit
+ _fini(void)
+ {
+ 	zfs_detach();
+-	zvol_fini();
+ 	zfs_fini();
+ 	spa_fini();
++	zvol_fini();
+ 
+ 	tsd_destroy(&zfs_fsyncer_key);
+ 	tsd_destroy(&rrw_tsd_key);
+diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
+index 8b0be3f..6e33106 100644
+--- a/module/zfs/zvol.c
++++ b/module/zfs/zvol.c
+@@ -41,12 +41,15 @@
+ #include <sys/dmu_traverse.h>
+ #include <sys/dsl_dataset.h>
+ #include <sys/dsl_prop.h>
++#include <sys/dsl_dir.h>
+ #include <sys/zap.h>
+ #include <sys/zfeature.h>
+ #include <sys/zil_impl.h>
++#include <sys/dmu_tx.h>
+ #include <sys/zio.h>
+ #include <sys/zfs_rlock.h>
+ #include <sys/zfs_znode.h>
++#include <sys/spa_impl.h>
+ #include <sys/zvol.h>
+ #include <linux/blkdev_compat.h>
+ 
+@@ -80,6 +83,23 @@ typedef struct zvol_state {
+ 	list_node_t		zv_next;	/* next zvol_state_t linkage */
+ } zvol_state_t;
+ 
++typedef enum {
++	ZVOL_ASYNC_CREATE_MINORS,
++	ZVOL_ASYNC_REMOVE_MINORS,
++	ZVOL_ASYNC_RENAME_MINORS,
++	ZVOL_ASYNC_SET_SNAPDEV,
++	ZVOL_ASYNC_MAX
++} zvol_async_op_t;
++
++typedef struct {
++	zvol_async_op_t op;
++	char pool[MAXNAMELEN];
++	char name1[MAXNAMELEN];
++	char name2[MAXNAMELEN];
++	zprop_source_t source;
++	uint64_t snapdev;
++} zvol_task_t;
++
+ #define	ZVOL_RDONLY	0x1
+ 
+ /*
+@@ -953,7 +973,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ 
+ 	/*
+ 	 * If the caller is already holding the mutex do not take it
+-	 * again, this will happen as part of zvol_create_minor().
++	 * again, this will happen as part of zvol_create_minor_impl().
+ 	 * Once add_disk() is called the device is live and the kernel
+ 	 * will attempt to open it to read the partition information.
+ 	 */
+@@ -1273,31 +1293,13 @@ zvol_free(zvol_state_t *zv)
+ 	kmem_free(zv, sizeof (zvol_state_t));
+ }
+ 
++/*
++ * Create a block device minor node and setup the linkage between it
++ * and the specified volume.  Once this function returns the block
++ * device is live and ready for use.
++ */
+ static int
+-__zvol_snapdev_hidden(const char *name)
+-{
+-	uint64_t snapdev;
+-	char *parent;
+-	char *atp;
+-	int error = 0;
+-
+-	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+-	(void) strlcpy(parent, name, MAXPATHLEN);
+-
+-	if ((atp = strrchr(parent, '@')) != NULL) {
+-		*atp = '\0';
+-		error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+-		if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+-			error = SET_ERROR(ENODEV);
+-	}
+-
+-	kmem_free(parent, MAXPATHLEN);
+-
+-	return (SET_ERROR(error));
+-}
+-
+-static int
+-__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
++zvol_create_minor_impl(const char *name)
+ {
+ 	zvol_state_t *zv;
+ 	objset_t *os;
+@@ -1307,7 +1309,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+ 	unsigned minor = 0;
+ 	int error = 0;
+ 
+-	ASSERT(MUTEX_HELD(&zvol_state_lock));
++	mutex_enter(&zvol_state_lock);
+ 
+ 	zv = zvol_find_by_name(name);
+ 	if (zv) {
+@@ -1315,12 +1317,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+ 		goto out;
+ 	}
+ 
+-	if (ignore_snapdev == B_FALSE) {
+-		error = __zvol_snapdev_hidden(name);
+-		if (error)
+-			goto out;
+-	}
+-
+ 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ 
+ 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+@@ -1406,69 +1402,18 @@ out:
+ 		 */
+ 		mutex_exit(&zvol_state_lock);
+ 		add_disk(zv->zv_disk);
+-		mutex_enter(&zvol_state_lock);
++	} else {
++		mutex_exit(&zvol_state_lock);
+ 	}
+ 
+ 	return (SET_ERROR(error));
+ }
+ 
+ /*
+- * Create a block device minor node and setup the linkage between it
+- * and the specified volume.  Once this function returns the block
+- * device is live and ready for use.
+- */
+-int
+-zvol_create_minor(const char *name)
+-{
+-	int error;
+-
+-	mutex_enter(&zvol_state_lock);
+-	error = __zvol_create_minor(name, B_FALSE);
+-	mutex_exit(&zvol_state_lock);
+-
+-	return (SET_ERROR(error));
+-}
+-
+-static int
+-__zvol_remove_minor(const char *name)
+-{
+-	zvol_state_t *zv;
+-
+-	ASSERT(MUTEX_HELD(&zvol_state_lock));
+-
+-	zv = zvol_find_by_name(name);
+-	if (zv == NULL)
+-		return (SET_ERROR(ENXIO));
+-
+-	if (zv->zv_open_count > 0)
+-		return (SET_ERROR(EBUSY));
+-
+-	zvol_remove(zv);
+-	zvol_free(zv);
+-
+-	return (0);
+-}
+-
+-/*
+- * Remove a block device minor node for the specified volume.
+- */
+-int
+-zvol_remove_minor(const char *name)
+-{
+-	int error;
+-
+-	mutex_enter(&zvol_state_lock);
+-	error = __zvol_remove_minor(name);
+-	mutex_exit(&zvol_state_lock);
+-
+-	return (SET_ERROR(error));
+-}
+-
+-/*
+  * Rename a block device minor mode for the specified volume.
+  */
+ static void
+-__zvol_rename_minor(zvol_state_t *zv, const char *newname)
++zvol_rename_minor(zvol_state_t *zv, const char *newname)
+ {
+ 	int readonly = get_disk_ro(zv->zv_disk);
+ 
+@@ -1488,30 +1433,120 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname)
+ 	set_disk_ro(zv->zv_disk, readonly);
+ }
+ 
++
++/*
++ * Mask errors to continue dmu_objset_find() traversal
++ */
++static int
++zvol_create_snap_minor_cb(const char *dsname, void *arg)
++{
++	const char *name = (const char *)arg;
++
++	/* skip the designated dataset */
++	if (name && strcmp(dsname, name) == 0)
++		return (0);
++
++	/* at this point, the dsname should name a snapshot */
++	if (strchr(dsname, '@') == 0) {
++		dprintf("zvol_create_snap_minor_cb(): "
++			"%s is not a shapshot name\n", dsname);
++	} else {
++		(void) zvol_create_minor_impl(dsname);
++	}
++
++	return (0);
++}
++
++/*
++ * Mask errors to continue dmu_objset_find() traversal
++ */
+ static int
+ zvol_create_minors_cb(const char *dsname, void *arg)
+ {
+-	(void) zvol_create_minor(dsname);
++	uint64_t snapdev;
++	int error;
++
++	error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
++	if (error)
++		return (0);
++
++	/*
++	 * Given the name and the 'snapdev' property, create device minor nodes
++	 * with the linkages to zvols/snapshots as needed.
++	 * If the name represents a zvol, create a minor node for the zvol, then
++	 * check if its snapshots are 'visible', and if so, iterate over the
++	 * snapshots and create device minor nodes for those.
++	 */
++	if (strchr(dsname, '@') == 0) {
++		/* create minor for the 'dsname' explicitly */
++		error = zvol_create_minor_impl(dsname);
++		if ((error == 0 || error == EEXIST) &&
++		    (snapdev == ZFS_SNAPDEV_VISIBLE)) {
++			fstrans_cookie_t cookie = spl_fstrans_mark();
++			/*
++			 * traverse snapshots only, do not traverse children,
++			 * and skip the 'dsname'
++			 */
++			error = dmu_objset_find((char *)dsname,
++			    zvol_create_snap_minor_cb, (void *)dsname,
++			    DS_FIND_SNAPSHOTS);
++			spl_fstrans_unmark(cookie);
++		}
++	} else {
++		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
++			dsname);
++	}
+ 
+ 	return (0);
+ }
+ 
+ /*
+- * Create minors for specified dataset including children and snapshots.
++ * Create minors for the specified dataset, including children and snapshots.
++ * Pay attention to the 'snapdev' property and iterate over the snapshots
++ * only if they are 'visible'. This approach allows one to assure that the
++ * snapshot metadata is read from disk only if it is needed.
++ *
++ * The name can represent a dataset to be recursively scanned for zvols and
++ * their snapshots, or a single zvol snapshot. If the name represents a
++ * dataset, the scan is performed in two nested stages:
++ * - scan the dataset for zvols, and
++ * - for each zvol, create a minor node, then check if the zvol's snapshots
++ *   are 'visible', and only then iterate over the snapshots if needed
++ *
++ * If the name represents a snapshot, a check is perfromed if the snapshot is
++ * 'visible' (which also verifies that the parent is a zvol), and if so,
++ * a minor node for that snapshot is created.
+  */
+-int
+-zvol_create_minors(const char *name)
++static int
++zvol_create_minors_impl(const char *name)
+ {
+ 	int error = 0;
+ 	fstrans_cookie_t cookie;
++	char *atp, *parent;
+ 
+ 	if (zvol_inhibit_dev)
+ 		return (0);
+ 
+-	cookie = spl_fstrans_mark();
+-	error = dmu_objset_find((char *)name, zvol_create_minors_cb,
+-	    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+-	spl_fstrans_unmark(cookie);
++	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
++	(void) strlcpy(parent, name, MAXPATHLEN);
++
++	if ((atp = strrchr(parent, '@')) != NULL) {
++		uint64_t snapdev;
++
++		*atp = '\0';
++		error = dsl_prop_get_integer(parent, "snapdev",
++		    &snapdev, NULL);
++
++		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
++			error = zvol_create_minor_impl(name);
++	} else {
++		cookie = spl_fstrans_mark();
++		error = dmu_objset_find(parent, zvol_create_minors_cb,
++		    NULL, DS_FIND_CHILDREN);
++		spl_fstrans_unmark(cookie);
++	}
++
++	kmem_free(parent, MAXPATHLEN);
+ 
+ 	return (SET_ERROR(error));
+ }
+@@ -1519,8 +1554,8 @@ zvol_create_minors(const char *name)
+ /*
+  * Remove minors for specified dataset including children and snapshots.
+  */
+-void
+-zvol_remove_minors(const char *name)
++static void
++zvol_remove_minors_impl(const char *name)
+ {
+ 	zvol_state_t *zv, *zv_next;
+ 	int namelen = ((name) ? strlen(name) : 0);
+@@ -1550,11 +1585,41 @@ zvol_remove_minors(const char *name)
+ 	mutex_exit(&zvol_state_lock);
+ }
+ 
++/* Remove minor for this specific snapshot only */
++static void
++zvol_remove_minor_impl(const char *name)
++{
++	zvol_state_t *zv, *zv_next;
++
++	if (zvol_inhibit_dev)
++		return;
++
++	if (strchr(name, '@') == NULL)
++		return;
++
++	mutex_enter(&zvol_state_lock);
++
++	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
++		zv_next = list_next(&zvol_state_list, zv);
++
++		if (strcmp(zv->zv_name, name) == 0) {
++			/* If in use, leave alone */
++			if (zv->zv_open_count > 0)
++				continue;
++			zvol_remove(zv);
++			zvol_free(zv);
++			break;
++		}
++	}
++
++	mutex_exit(&zvol_state_lock);
++}
++
+ /*
+  * Rename minors for specified dataset including children and snapshots.
+  */
+-void
+-zvol_rename_minors(const char *oldname, const char *newname)
++static void
++zvol_rename_minors_impl(const char *oldname, const char *newname)
+ {
+ 	zvol_state_t *zv, *zv_next;
+ 	int oldnamelen, newnamelen;
+@@ -1577,14 +1642,14 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ 			continue;
+ 
+ 		if (strcmp(zv->zv_name, oldname) == 0) {
+-			__zvol_rename_minor(zv, newname);
++			zvol_rename_minor(zv, newname);
+ 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+ 		    (zv->zv_name[oldnamelen] == '/' ||
+ 		    zv->zv_name[oldnamelen] == '@')) {
+ 			snprintf(name, MAXNAMELEN, "%s%c%s", newname,
+ 			    zv->zv_name[oldnamelen],
+ 			    zv->zv_name + oldnamelen + 1);
+-			__zvol_rename_minor(zv, name);
++			zvol_rename_minor(zv, name);
+ 		}
+ 	}
+ 
+@@ -1593,42 +1658,227 @@ zvol_rename_minors(const char *oldname, const char *newname)
+ 	kmem_free(name, MAXNAMELEN);
+ }
+ 
++typedef struct zvol_snapdev_cb_arg {
++	uint64_t snapdev;
++} zvol_snapdev_cb_arg_t;
++
+ static int
+-snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+-	uint64_t snapdev = *(uint64_t *) arg;
++zvol_set_snapdev_cb(const char *dsname, void *param) {
++	zvol_snapdev_cb_arg_t *arg = param;
+ 
+ 	if (strchr(dsname, '@') == NULL)
+ 		return (0);
+ 
+-	switch (snapdev) {
++	switch (arg->snapdev) {
+ 		case ZFS_SNAPDEV_VISIBLE:
+-			mutex_enter(&zvol_state_lock);
+-			(void) __zvol_create_minor(dsname, B_TRUE);
+-			mutex_exit(&zvol_state_lock);
++			(void) zvol_create_minor_impl(dsname);
+ 			break;
+ 		case ZFS_SNAPDEV_HIDDEN:
+-			(void) zvol_remove_minor(dsname);
++			(void) zvol_remove_minor_impl(dsname);
+ 			break;
+ 	}
+ 
+ 	return (0);
+ }
+ 
++static void
++zvol_set_snapdev_impl(char *name, uint64_t snapdev)
++{
++	zvol_snapdev_cb_arg_t arg = {snapdev};
++	fstrans_cookie_t cookie = spl_fstrans_mark();
++	/*
++	 * The zvol_set_snapdev_sync() sets snapdev appropriately
++	 * in the dataset hierarchy. Here, we only scan snapshots.
++	 */
++	dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
++	spl_fstrans_unmark(cookie);
++}
++
++static zvol_task_t *
++zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
++    uint64_t snapdev)
++{
++	zvol_task_t *task;
++	char *delim;
++
++	/* Never allow tasks on hidden names. */
++	if (name1[0] == '$')
++		return (NULL);
++
++	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
++	task->op = op;
++	task->snapdev = snapdev;
++	delim = strchr(name1, '/');
++	strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
++
++	strlcpy(task->name1, name1, MAXNAMELEN);
++	if (name2 != NULL)
++		strlcpy(task->name2, name2, MAXNAMELEN);
++
++	return (task);
++}
++
++static void
++zvol_task_free(zvol_task_t *task)
++{
++	kmem_free(task, sizeof (zvol_task_t));
++}
++
++/*
++ * The worker thread function performed asynchronously.
++ */
++static void
++zvol_task_cb(void *param)
++{
++	zvol_task_t *task = (zvol_task_t *)param;
++
++	switch (task->op) {
++	case ZVOL_ASYNC_CREATE_MINORS:
++		(void) zvol_create_minors_impl(task->name1);
++		break;
++	case ZVOL_ASYNC_REMOVE_MINORS:
++		zvol_remove_minors_impl(task->name1);
++		break;
++	case ZVOL_ASYNC_RENAME_MINORS:
++		zvol_rename_minors_impl(task->name1, task->name2);
++		break;
++	case ZVOL_ASYNC_SET_SNAPDEV:
++		zvol_set_snapdev_impl(task->name1, task->snapdev);
++		break;
++	default:
++		VERIFY(0);
++		break;
++	}
++
++	zvol_task_free(task);
++}
++
++typedef struct zvol_set_snapdev_arg {
++	const char *zsda_name;
++	uint64_t zsda_value;
++	zprop_source_t zsda_source;
++	dmu_tx_t *zsda_tx;
++} zvol_set_snapdev_arg_t;
++
++/*
++ * Sanity check the dataset for safe use by the sync task.  No additional
++ * conditions are imposed.
++ */
++static int
++zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
++{
++	zvol_set_snapdev_arg_t *zsda = arg;
++	dsl_pool_t *dp = dmu_tx_pool(tx);
++	dsl_dir_t *dd;
++	int error;
++
++	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
++	if (error != 0)
++		return (error);
++
++	dsl_dir_rele(dd, FTAG);
++
++	return (error);
++}
++
++static int
++zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
++{
++	zvol_set_snapdev_arg_t *zsda = arg;
++	char dsname[MAXNAMELEN];
++	zvol_task_t *task;
++
++	dsl_dataset_name(ds, dsname);
++	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
++	    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
++	    &zsda->zsda_value, zsda->zsda_tx);
++
++	task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
++	    NULL, zsda->zsda_value);
++	if (task == NULL)
++		return (0);
++
++	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
++		task, TQ_SLEEP);
++	return (0);
++}
++
++/*
++ * Traverse all child snapshot datasets and apply snapdev appropriately.
++ */
++static void
++zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
++{
++	zvol_set_snapdev_arg_t *zsda = arg;
++	dsl_pool_t *dp = dmu_tx_pool(tx);
++	dsl_dir_t *dd;
++
++	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
++	zsda->zsda_tx = tx;
++
++	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
++	    zsda, DS_FIND_CHILDREN);
++
++	dsl_dir_rele(dd, FTAG);
++}
++
+ int
+-zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
+-	fstrans_cookie_t cookie;
++zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
++{
++	zvol_set_snapdev_arg_t zsda;
+ 
+-	if (zvol_inhibit_dev)
+-		/* caller should continue to modify snapdev property */
+-		return (-1);
++	zsda.zsda_name = ddname;
++	zsda.zsda_source = source;
++	zsda.zsda_value = snapdev;
+ 
+-	cookie = spl_fstrans_mark();
+-	(void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+-		&snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+-	spl_fstrans_unmark(cookie);
++	return (dsl_sync_task(ddname, zvol_set_snapdev_check,
++	    zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
++}
++
++void
++zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
++{
++	zvol_task_t *task;
++	taskqid_t id;
++
++	task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL);
++	if (task == NULL)
++		return;
++
++	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++	if ((async == B_FALSE) && (id != 0))
++		taskq_wait_id(spa->spa_zvol_taskq, id);
++}
++
++void
++zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
++{
++	zvol_task_t *task;
++	taskqid_t id;
++
++	task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
++	if (task == NULL)
++		return;
+ 
+-	/* caller should continue to modify snapdev property */
+-	return (-1);
++	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++	if ((async == B_FALSE) && (id != 0))
++		taskq_wait_id(spa->spa_zvol_taskq, id);
++}
++
++void
++zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
++    boolean_t async)
++{
++	zvol_task_t *task;
++	taskqid_t id;
++
++	task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
++	if (task == NULL)
++		return;
++
++	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
++	if ((async == B_FALSE) && (id != 0))
++		taskq_wait_id(spa->spa_zvol_taskq, id);
+ }
+ 
+ int
+@@ -1662,11 +1911,13 @@ out:
+ void
+ zvol_fini(void)
+ {
+-	zvol_remove_minors(NULL);
++	zvol_remove_minors_impl(NULL);
++
+ 	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+ 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+-	mutex_destroy(&zvol_state_lock);
++
+ 	list_destroy(&zvol_state_list);
++	mutex_destroy(&zvol_state_lock);
+ }
+ 
+ module_param(zvol_inhibit_dev, uint, 0644);
+diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh
+index 45ccf62..1908dc1 100755
+--- a/scripts/zconfig.sh
++++ b/scripts/zconfig.sh
+@@ -217,15 +217,26 @@ test_3() {
+ 	zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \
+ 	    ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 11
+ 
++	# Toggle the snapdev and observe snapshot device links toggled
++	${ZFS} set snapdev=hidden ${FULL_ZVOL_NAME} || fail 12
++	
++	zconfig_zvol_device_stat 7 ${POOL_NAME} ${FULL_ZVOL_NAME} \
++	    "invalid" ${FULL_CLONE_NAME} || fail 13
++
++	${ZFS} set snapdev=visible ${FULL_ZVOL_NAME} || fail 14
++
++	zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \
++	    ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 15
++
+ 	# Destroy the pool and consequently the devices
+-	${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 12
++	${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 16
+ 
+ 	# verify the devices were removed
+ 	zconfig_zvol_device_stat 0 ${POOL_NAME} ${FULL_ZVOL_NAME} \
+-	    ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 13
++	    ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 17
+ 
+-	${ZFS_SH} -u || fail 14
+-	rm -f ${TMP_CACHE} || fail 15
++	${ZFS_SH} -u || fail 18
++	rm -f ${TMP_CACHE} || fail 19
+ 
+ 	pass
+ }
+-- 
+2.1.4
+
diff --git a/zfs-patches/series b/zfs-patches/series
index 735ea2d..fe93901 100644
--- a/zfs-patches/series
+++ b/zfs-patches/series
@@ -1,2 +1,5 @@
 fix-control.patch
 fix-dh-installinit.patch
+0001-Fix-lock-order-inversion-with-zvol_open.patch
+0002-Make-zvol-minor-functionality-more-robust.patch
+0003-Add-support-for-asynchronous-zvol-minor-operations.patch
-- 
2.1.4





More information about the pve-devel mailing list