[pve-devel] [PATCH zfsonlinux] add patch with backport of a7a144e65 ("enforce arc_dnode_limit")

Stoiko Ivanov s.ivanov at proxmox.com
Mon Jul 28 18:32:18 CEST 2025


I forgot to add to the subject prefix that this patch is targeted at the
stable-bookworm branch - sorry for the noise.

On Mon, 28 Jul 2025 18:30:41 +0200
Stoiko Ivanov <s.ivanov at proxmox.com> wrote:

> as requested and argued in:
> https://lore.proxmox.com/pve-devel/5f3e46ed-bf99-45e2-b497-fc81dc50d9b3@proxmox.com/
> 
> Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
> ---
> If accepted we'd try to include the backport upstream for 2.2.9
> ...kport-enforce-arc_dnode_limit-to-2.2.patch | 207 ++++++++++++++++++
>  debian/patches/series                         |   1 +
>  2 files changed, 208 insertions(+)
>  create mode 100644 debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch
> 
> diff --git a/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch
> new file mode 100644
> index 000000000..26c0dface
> --- /dev/null
> +++ b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch
> @@ -0,0 +1,207 @@
> +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
> +From: Stoiko Ivanov <s.ivanov at proxmox.com>
> +Date: Mon, 28 Jul 2025 15:16:46 +0200
> +Subject: [PATCH] backport enforce arc_dnode_limit to 2.2
> +
> +This patch is a backport of a7a144e65 ("enforce arc_dnode_limit")
> +for the 2.2 branch.
> +
> +back-ported from commit a7a144e655850b4160943e4ba315eb9a5dc2b2fe
> +working around changes from:
> +55427add3 ("Several improvements to ARC shrinking (#16197)")
> +5b9f3b766 ("Soften pruning threshold on not evictable metadata")
> +which are present in 2.3.3, but not in 2.2.8
> +
> +Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
> +---
> + include/sys/arc_impl.h           |  2 +-
> + module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++
> + module/zfs/arc.c                 | 27 ++++++++-----
> + 3 files changed, 83 insertions(+), 11 deletions(-)
> +
> +diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
> +index defebe3b2fbbdc8b1c901108f19bde8f12ea2175..36cd83e83358e123980909a903854d573531d4b6 100644
> +--- a/include/sys/arc_impl.h
> ++++ b/include/sys/arc_impl.h
> +@@ -952,7 +952,7 @@ typedef struct arc_sums {
> + 	wmsum_t arcstat_data_size;
> + 	wmsum_t arcstat_metadata_size;
> + 	wmsum_t arcstat_dbuf_size;
> +-	wmsum_t arcstat_dnode_size;
> ++	aggsum_t arcstat_dnode_size;
> + 	wmsum_t arcstat_bonus_size;
> + 	wmsum_t arcstat_l2_hits;
> + 	wmsum_t arcstat_l2_misses;
> +diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
> +index 1f72cce07dd1830e2f5fdff50ef298e05be3013d..da0cda03985e93acfa111efb7d6e9d6637f729cf 100644
> +--- a/module/os/linux/zfs/zfs_vfsops.c
> ++++ b/module/os/linux/zfs/zfs_vfsops.c
> +@@ -1179,6 +1179,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
> + 	return (error);
> + }
> + 
> ++/*
> ++ * Dentry and inode caches referenced by a task in non-root memcg are
> ++ * not going to be scanned by the kernel-provided shrinker. So, if
> ++ * kernel prunes nothing, fall back to this manual walk to free dnodes.
> ++ * To avoid scanning the same znodes multiple times they are always rotated
> ++ * to the end of the z_all_znodes list. New znodes are inserted at the
> ++ * end of the list so we're always scanning the oldest znodes first.
> ++ */
> ++static int
> ++zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
> ++{
> ++	znode_t **zp_array, *zp;
> ++	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
> ++	int objects = 0;
> ++	int i = 0, j = 0;
> ++
> ++	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
> ++
> ++	mutex_enter(&zfsvfs->z_znodes_lock);
> ++	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
> ++
> ++		if ((i++ > nr_to_scan) || (j >= max_array))
> ++			break;
> ++
> ++		ASSERT(list_link_active(&zp->z_link_node));
> ++		list_remove(&zfsvfs->z_all_znodes, zp);
> ++		list_insert_tail(&zfsvfs->z_all_znodes, zp);
> ++
> ++		/* Skip active znodes and .zfs entries */
> ++		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
> ++			continue;
> ++
> ++		if (igrab(ZTOI(zp)) == NULL)
> ++			continue;
> ++
> ++		zp_array[j] = zp;
> ++		j++;
> ++	}
> ++	mutex_exit(&zfsvfs->z_znodes_lock);
> ++
> ++	for (i = 0; i < j; i++) {
> ++		zp = zp_array[i];
> ++
> ++		ASSERT3P(zp, !=, NULL);
> ++		d_prune_aliases(ZTOI(zp));
> ++
> ++		if (atomic_read(&ZTOI(zp)->i_count) == 1)
> ++			objects++;
> ++
> ++		zrele(zp);
> ++	}
> ++
> ++	vmem_free(zp_array, max_array * sizeof (znode_t *));
> ++
> ++	return (objects);
> ++}
> ++
> + /*
> +  * The ARC has requested that the filesystem drop entries from the dentry
> +  * and inode caches.  This can occur when the ARC needs to free meta data
> +@@ -1222,6 +1279,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
> + 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
> + #endif
> + 
> ++	/*
> ++	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
> ++	 * due to dentry and inode caches being referenced by a task running
> ++	 * in non-root memcg.
> ++	 */
> ++	if (*objects == 0)
> ++		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
> ++
> + 	zfs_exit(zfsvfs, FTAG);
> + 
> + 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
> +diff --git a/module/zfs/arc.c b/module/zfs/arc.c
> +index 5c6e92f0f8b31dbcd569c92e645afb2e180b2deb..383aca2808d2c0aa8d09a9cdc8cfbfde4f6a6fc9 100644
> +--- a/module/zfs/arc.c
> ++++ b/module/zfs/arc.c
> +@@ -2597,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
> + 		ARCSTAT_INCR(arcstat_bonus_size, space);
> + 		break;
> + 	case ARC_SPACE_DNODE:
> +-		ARCSTAT_INCR(arcstat_dnode_size, space);
> ++		aggsum_add(&arc_sums.arcstat_dnode_size, space);
> + 		break;
> + 	case ARC_SPACE_DBUF:
> + 		ARCSTAT_INCR(arcstat_dbuf_size, space);
> +@@ -2643,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
> + 		ARCSTAT_INCR(arcstat_bonus_size, -space);
> + 		break;
> + 	case ARC_SPACE_DNODE:
> +-		ARCSTAT_INCR(arcstat_dnode_size, -space);
> ++		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
> + 		break;
> + 	case ARC_SPACE_DBUF:
> + 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
> +@@ -4292,7 +4292,7 @@ arc_evict(void)
> + 	 * target is not evictable or if they go over arc_dnode_limit.
> + 	 */
> + 	int64_t prune = 0;
> +-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
> ++	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
> + 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
> + 	if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
> + 	    zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
> +@@ -4775,12 +4775,19 @@ arc_is_overflowing(boolean_t use_reserve)
> + 	 * in the ARC. In practice, that's in the tens of MB, which is low
> + 	 * enough to be safe.
> + 	 */
> +-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
> ++	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) -
> + 	    arc_c - overflow / 2;
> + 	if (!use_reserve)
> + 		overflow /= 2;
> +-	return (over < 0 ? ARC_OVF_NONE :
> +-	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
> ++
> ++	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
> ++	    arc_dnode_limit;
> ++
> ++	/* Always allow at least one block of overflow. */
> ++	if (arc_over < 0 && dn_over <= 0)
> ++		return (ARC_OVF_NONE);
> ++
> ++	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
> + }
> + 
> + static abd_t *
> +@@ -6938,7 +6945,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
> + #if defined(COMPAT_FREEBSD11)
> + 	as->arcstat_other_size.value.ui64 =
> + 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
> +-	    wmsum_value(&arc_sums.arcstat_dnode_size) +
> ++	    aggsum_value(&arc_sums.arcstat_dnode_size) +
> + 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
> + #endif
> + 
> +@@ -6980,7 +6987,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
> + 	    &as->arcstat_uncached_evictable_metadata);
> + 
> + 	as->arcstat_dnode_size.value.ui64 =
> +-	    wmsum_value(&arc_sums.arcstat_dnode_size);
> ++	    aggsum_value(&arc_sums.arcstat_dnode_size);
> + 	as->arcstat_bonus_size.value.ui64 =
> + 	    wmsum_value(&arc_sums.arcstat_bonus_size);
> + 	as->arcstat_l2_hits.value.ui64 =
> +@@ -7349,7 +7356,7 @@ arc_state_init(void)
> + 	wmsum_init(&arc_sums.arcstat_data_size, 0);
> + 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
> + 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
> +-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
> ++	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
> + 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
> + 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
> + 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
> +@@ -7507,7 +7514,7 @@ arc_state_fini(void)
> + 	wmsum_fini(&arc_sums.arcstat_data_size);
> + 	wmsum_fini(&arc_sums.arcstat_metadata_size);
> + 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
> +-	wmsum_fini(&arc_sums.arcstat_dnode_size);
> ++	aggsum_fini(&arc_sums.arcstat_dnode_size);
> + 	wmsum_fini(&arc_sums.arcstat_bonus_size);
> + 	wmsum_fini(&arc_sums.arcstat_l2_hits);
> + 	wmsum_fini(&arc_sums.arcstat_l2_misses);
> diff --git a/debian/patches/series b/debian/patches/series
> index 229027ff9..11a97debd 100644
> --- a/debian/patches/series
> +++ b/debian/patches/series
> @@ -9,3 +9,4 @@
>  0009-arc-stat-summary-guard-access-to-freshly-introduced-.patch
>  0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
>  0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
> +0012-backport-enforce-arc_dnode_limit-to-2.2.patch





More information about the pve-devel mailing list