[pve-devel] [PATCH zfsonlinux 1/2] update zfs submodule to 2.2.4 and refresh patches
Stoiko Ivanov
s.ivanov at proxmox.com
Tue May 7 15:38:35 CEST 2024
This mostly consists of dropping all the patches we had queued up to
get kernel 6.8 supported.
Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
...md-unit-for-importing-specific-pools.patch | 4 +-
...-move-manpage-arcstat-1-to-arcstat-8.patch | 2 +-
...-guard-access-to-l2arc-MFU-MRU-stats.patch | 12 +-
...hten-bounds-for-noalloc-stat-availab.patch | 4 +-
...rectly-handle-partition-16-and-later.patch | 52 --
...-use-splice_copy_file_range-for-fall.patch | 135 ----
.../0014-linux-5.4-compat-page_size.patch | 121 ----
.../patches/0015-abd-add-page-iterator.patch | 334 ---------
...-existing-functions-to-vdev_classic_.patch | 349 ---------
...v_disk-reorganise-vdev_disk_io_start.patch | 111 ---
...-read-write-IO-function-configurable.patch | 69 --
...e-BIO-filling-machinery-to-avoid-spl.patch | 671 ------------------
...dule-parameter-to-select-BIO-submiss.patch | 104 ---
...se-bio_chain-to-submit-multiple-BIOs.patch | 363 ----------
...on-t-use-compound-heads-on-Linux-4.5.patch | 96 ---
...ault-to-classic-submission-for-2.2.x.patch | 90 ---
...ion-caused-by-mmap-flushing-problems.patch | 104 ---
...touch-vbio-after-its-handed-off-to-t.patch | 57 --
debian/patches/series | 14 -
upstream | 2 +-
20 files changed, 12 insertions(+), 2682 deletions(-)
delete mode 100644 debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch
delete mode 100644 debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
delete mode 100644 debian/patches/0014-linux-5.4-compat-page_size.patch
delete mode 100644 debian/patches/0015-abd-add-page-iterator.patch
delete mode 100644 debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
delete mode 100644 debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch
delete mode 100644 debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch
delete mode 100644 debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
delete mode 100644 debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
delete mode 100644 debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
delete mode 100644 debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
delete mode 100644 debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
delete mode 100644 debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
delete mode 100644 debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
diff --git a/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch b/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch
index 8232978c..0600296f 100644
--- a/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch
+++ b/debian/patches/0007-Add-systemd-unit-for-importing-specific-pools.patch
@@ -18,7 +18,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
etc/Makefile.am | 1 +
etc/systemd/system/50-zfs.preset | 1 +
- etc/systemd/system/zfs-import at .service.in | 18 ++++++++++++++++
+ etc/systemd/system/zfs-import at .service.in | 18 ++++++++++++++++++
3 files changed, 20 insertions(+)
create mode 100644 etc/systemd/system/zfs-import at .service.in
@@ -48,7 +48,7 @@ index e4056a92c..030611419 100644
enable zfs-share.service
diff --git a/etc/systemd/system/zfs-import at .service.in b/etc/systemd/system/zfs-import at .service.in
new file mode 100644
-index 000000000..9b4ee9371
+index 000000000..5bd19fb79
--- /dev/null
+++ b/etc/systemd/system/zfs-import at .service.in
@@ -0,0 +1,18 @@
diff --git a/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch b/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
index c11c1ae8..9a4aea56 100644
--- a/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
+++ b/debian/patches/0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
@@ -15,7 +15,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
rename man/{man1/arcstat.1 => man8/arcstat.8} (99%)
diff --git a/man/Makefile.am b/man/Makefile.am
-index 45156571e..3713e9371 100644
+index 43bb014dd..a9293468a 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -2,7 +2,6 @@ dist_noinst_man_MANS = \
diff --git a/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch b/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
index f8cb3539..2e7c207d 100644
--- a/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+++ b/debian/patches/0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
@@ -27,7 +27,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
2 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/cmd/arc_summary b/cmd/arc_summary
-index 9c69ec4f8..edf94ea2a 100755
+index 100fb1987..86b2260a1 100755
--- a/cmd/arc_summary
+++ b/cmd/arc_summary
@@ -655,13 +655,13 @@ def section_arc(kstats_dict):
@@ -48,7 +48,7 @@ index 9c69ec4f8..edf94ea2a 100755
prt_i1('L2 ineligible evictions:',
f_bytes(arc_stats['evict_l2_ineligible']))
print()
-@@ -851,20 +851,20 @@ def section_l2arc(kstats_dict):
+@@ -860,20 +860,20 @@ def section_l2arc(kstats_dict):
f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
f_bytes(arc_stats['l2_hdr_size']))
prt_i2('MFU allocated size:',
@@ -80,10 +80,10 @@ index 9c69ec4f8..edf94ea2a 100755
print()
prt_1('L2ARC breakdown:', f_hits(l2_access_total))
diff --git a/cmd/arcstat.in b/cmd/arcstat.in
-index 8df1c62f7..833348d0e 100755
+index c4f10a1d6..c570dca88 100755
--- a/cmd/arcstat.in
+++ b/cmd/arcstat.in
-@@ -565,8 +565,8 @@ def calculate():
+@@ -597,8 +597,8 @@ def calculate():
v["el2skip"] = d["evict_l2_skip"] // sint
v["el2cach"] = d["evict_l2_cached"] // sint
v["el2el"] = d["evict_l2_eligible"] // sint
@@ -93,8 +93,8 @@ index 8df1c62f7..833348d0e 100755
+ v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint
v["el2inel"] = d["evict_l2_ineligible"] // sint
v["mtxmis"] = d["mutex_miss"] // sint
-
-@@ -581,11 +581,11 @@ def calculate():
+ v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] +
+@@ -624,11 +624,11 @@ def calculate():
v["l2size"] = cur["l2_size"]
v["l2bytes"] = d["l2_read_bytes"] // sint
diff --git a/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch b/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
index 3c87b0cb..29c7f9ab 100644
--- a/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
+++ b/debian/patches/0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
@@ -51,10 +51,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
-index 69bf9649a..fd42ce7c1 100644
+index ed0b8d7a1..f3acc49d0 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
-@@ -2616,7 +2616,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
+@@ -2663,7 +2663,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
if (vs->vs_scan_removing != 0) {
(void) printf(gettext(" (removing)"));
diff --git a/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch b/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch
deleted file mode 100644
index 578b74bd..00000000
--- a/debian/patches/0012-udev-correctly-handle-partition-16-and-later.patch
+++ /dev/null
@@ -1,52 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler at proxmox.com>
-Date: Wed, 6 Mar 2024 10:39:06 +0100
-Subject: [PATCH] udev: correctly handle partition #16 and later
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-If a zvol has more than 15 partitions, the minor device number exhausts
-the slot count reserved for partitions next to the zvol itself. As a
-result, the minor number cannot be used to determine the partition
-number for the higher partition, and doing so results in wrong named
-symlinks being generated by udev.
-
-Since the partition number is encoded in the block device name anyway,
-let's just extract it from there instead.
-
-Fixes: #15904
-
-Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
-Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
----
- udev/zvol_id.c | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-
-diff --git a/udev/zvol_id.c b/udev/zvol_id.c
-index 5960b9787..609349594 100644
---- a/udev/zvol_id.c
-+++ b/udev/zvol_id.c
-@@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
- int
- main(int argc, const char *const *argv)
- {
-- if (argc != 2) {
-+ if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
- fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
- return (1);
- }
-@@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
- return (1);
- }
-
-- unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
-- if (dev_part != 0)
-- sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
-+ const char *dev_part = strrchr(dev_name, 'p');
-+ if (dev_part != NULL) {
-+ sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
-+ }
-
- for (size_t i = 0; i < strlen(zvol_name); ++i)
- if (isblank(zvol_name[i]))
diff --git a/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch b/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
deleted file mode 100644
index 380d77c9..00000000
--- a/debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
+++ /dev/null
@@ -1,135 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob N <robn at despairlabs.com>
-Date: Thu, 21 Mar 2024 10:46:15 +1100
-Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback
-
-Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
-simple wrapper around splice_copy_file_range(). Detect that function
-directly and use it if generic_ is not available.
-
-Sponsored-by: https://despairlabs.com/sponsor/
-Reviewed-by: Tony Hutter <hutter2 at llnl.gov>
-Reviewed by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <robn at despairlabs.com>
-Closes #15930
-Closes #15931
-(cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0)
----
- config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++
- config/kernel.m4 | 2 ++
- module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
- 3 files changed, 43 insertions(+), 2 deletions(-)
-
-diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
-index cc96404d8..8a5cbe2ee 100644
---- a/config/kernel-vfs-file_range.m4
-+++ b/config/kernel-vfs-file_range.m4
-@@ -16,6 +16,9 @@ dnl #
- dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
- dnl # generic_copy_file_range() added to support it
- dnl #
-+dnl # 6.8: generic_copy_file_range() removed, replaced by
-+dnl # splice_copy_file_range()
-+dnl #
- AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
- ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
- #include <linux/fs.h>
-@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
- ])
- ])
-
-+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
-+ ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
-+ #include <linux/splice.h>
-+ ], [
-+ struct file *src_file __attribute__ ((unused)) = NULL;
-+ loff_t src_off __attribute__ ((unused)) = 0;
-+ struct file *dst_file __attribute__ ((unused)) = NULL;
-+ loff_t dst_off __attribute__ ((unused)) = 0;
-+ size_t len __attribute__ ((unused)) = 0;
-+ splice_copy_file_range(src_file, src_off, dst_file, dst_off,
-+ len);
-+ ])
-+])
-+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
-+ AC_MSG_CHECKING([whether splice_copy_file_range() is available])
-+ ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
-+ AC_MSG_RESULT(yes)
-+ AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
-+ [splice_copy_file_range() is available])
-+ ],[
-+ AC_MSG_RESULT(no)
-+ ])
-+])
-+
- AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
- ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
- #include <linux/fs.h>
-diff --git a/config/kernel.m4 b/config/kernel.m4
-index e3f864577..1d0c5a27f 100644
---- a/config/kernel.m4
-+++ b/config/kernel.m4
-@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
- ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
- ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
- ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
-+ ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
- ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
- ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
- ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
-@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
- ZFS_AC_KERNEL_VFS_IOV_ITER
- ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
- ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
-+ ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
- ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
- ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
- ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
-diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
-index 3065d54fa..64728fdb1 100644
---- a/module/os/linux/zfs/zpl_file_range.c
-+++ b/module/os/linux/zfs/zpl_file_range.c
-@@ -26,6 +26,9 @@
- #include <linux/compat.h>
- #endif
- #include <linux/fs.h>
-+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
-+#include <linux/splice.h>
-+#endif
- #include <sys/file.h>
- #include <sys/zfs_znode.h>
- #include <sys/zfs_vnops.h>
-@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
- ret = zpl_clone_file_range_impl(src_file, src_off,
- dst_file, dst_off, len);
-
--#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
-+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
- /*
- * Since Linux 5.3 the filesystem driver is responsible for executing
- * an appropriate fallback, and a generic fallback function is provided.
-@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
- ret == -EAGAIN)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
-+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
-+ /*
-+ * Since 6.8 the fallback function is called splice_copy_file_range
-+ * and has a slightly different signature.
-+ */
-+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
-+ ret == -EAGAIN)
-+ ret = splice_copy_file_range(src_file, src_off, dst_file,
-+ dst_off, len);
- #else
- /*
- * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
-@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
- */
- if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
- ret = -EOPNOTSUPP;
--#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
-+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
-
- return (ret);
- }
diff --git a/debian/patches/0014-linux-5.4-compat-page_size.patch b/debian/patches/0014-linux-5.4-compat-page_size.patch
deleted file mode 100644
index 258c025d..00000000
--- a/debian/patches/0014-linux-5.4-compat-page_size.patch
+++ /dev/null
@@ -1,121 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Mon, 13 Nov 2023 17:55:29 +1100
-Subject: [PATCH] linux 5.4 compat: page_size()
-
-Before 5.4 we have to do a little math.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21)
----
- config/kernel-mm-page-size.m4 | 17 +++++++++++
- config/kernel.m4 | 2 ++
- include/os/linux/Makefile.am | 1 +
- include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
- 4 files changed, 56 insertions(+)
- create mode 100644 config/kernel-mm-page-size.m4
- create mode 100644 include/os/linux/kernel/linux/mm_compat.h
-
-diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
-new file mode 100644
-index 000000000..d5ebd9269
---- /dev/null
-+++ b/config/kernel-mm-page-size.m4
-@@ -0,0 +1,17 @@
-+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
-+ ZFS_LINUX_TEST_SRC([page_size], [
-+ #include <linux/mm.h>
-+ ],[
-+ unsigned long s;
-+ s = page_size(NULL);
-+ ])
-+])
-+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
-+ AC_MSG_CHECKING([whether page_size() is available])
-+ ZFS_LINUX_TEST_RESULT([page_size], [
-+ AC_MSG_RESULT(yes)
-+ AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
-+ ],[
-+ AC_MSG_RESULT(no)
-+ ])
-+])
-diff --git a/config/kernel.m4 b/config/kernel.m4
-index 1d0c5a27f..548905ccd 100644
---- a/config/kernel.m4
-+++ b/config/kernel.m4
-@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
- ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
- ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
- ZFS_AC_KERNEL_SRC_SYNC_BDEV
-+ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
- case "$host_cpu" in
- powerpc*)
- ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
-@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
- ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
- ZFS_AC_KERNEL_COPY_SPLICE_READ
- ZFS_AC_KERNEL_SYNC_BDEV
-+ ZFS_AC_KERNEL_MM_PAGE_SIZE
- case "$host_cpu" in
- powerpc*)
- ZFS_AC_KERNEL_CPU_HAS_FEATURE
-diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
-index 3830d198d..51c27132b 100644
---- a/include/os/linux/Makefile.am
-+++ b/include/os/linux/Makefile.am
-@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
- %D%/kernel/linux/compiler_compat.h \
- %D%/kernel/linux/dcache_compat.h \
- %D%/kernel/linux/kmap_compat.h \
-+ %D%/kernel/linux/mm_compat.h \
- %D%/kernel/linux/mod_compat.h \
- %D%/kernel/linux/page_compat.h \
- %D%/kernel/linux/percpu_compat.h \
-diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
-new file mode 100644
-index 000000000..40056c68d
---- /dev/null
-+++ b/include/os/linux/kernel/linux/mm_compat.h
-@@ -0,0 +1,36 @@
-+/*
-+ * CDDL HEADER START
-+ *
-+ * The contents of this file are subject to the terms of the
-+ * Common Development and Distribution License (the "License").
-+ * You may not use this file except in compliance with the License.
-+ *
-+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-+ * or https://opensource.org/licenses/CDDL-1.0.
-+ * See the License for the specific language governing permissions
-+ * and limitations under the License.
-+ *
-+ * When distributing Covered Code, include this CDDL HEADER in each
-+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-+ * If applicable, add the following below this CDDL HEADER, with the
-+ * fields enclosed by brackets "[]" replaced with your own identifying
-+ * information: Portions Copyright [yyyy] [name of copyright owner]
-+ *
-+ * CDDL HEADER END
-+ */
-+
-+/*
-+ * Copyright (c) 2023, 2024, Klara Inc.
-+ */
-+
-+#ifndef _ZFS_MM_COMPAT_H
-+#define _ZFS_MM_COMPAT_H
-+
-+#include <linux/mm.h>
-+
-+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
-+#ifndef HAVE_MM_PAGE_SIZE
-+#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
-+#endif
-+
-+#endif /* _ZFS_MM_COMPAT_H */
diff --git a/debian/patches/0015-abd-add-page-iterator.patch b/debian/patches/0015-abd-add-page-iterator.patch
deleted file mode 100644
index bb91ea32..00000000
--- a/debian/patches/0015-abd-add-page-iterator.patch
+++ /dev/null
@@ -1,334 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Mon, 11 Dec 2023 16:05:54 +1100
-Subject: [PATCH] abd: add page iterator
-
-The regular ABD iterators yield data buffers, so they have to map and
-unmap pages into kernel memory. If the caller only wants to count
-chunks, or can use page pointers directly, then the map/unmap is just
-unnecessary overhead.
-
-This adds adb_iterate_page_func, which yields unmapped struct page
-instead.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
----
- include/sys/abd.h | 7 +++
- include/sys/abd_impl.h | 26 ++++++++-
- module/os/freebsd/zfs/abd_os.c | 4 +-
- module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++---
- module/zfs/abd.c | 42 +++++++++++++
- 5 files changed, 169 insertions(+), 14 deletions(-)
-
-diff --git a/include/sys/abd.h b/include/sys/abd.h
-index 750f9986c..8a2df0bca 100644
---- a/include/sys/abd.h
-+++ b/include/sys/abd.h
-@@ -79,6 +79,9 @@ typedef struct abd {
-
- typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
- typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
-+#if defined(__linux__) && defined(_KERNEL)
-+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
-+#endif
-
- extern int zfs_abd_scatter_enabled;
-
-@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
- int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
- int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
- abd_iter_func2_t *, void *);
-+#if defined(__linux__) && defined(_KERNEL)
-+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
-+ void *);
-+#endif
- void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
- void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
- void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
-diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
-index 40546d4af..f88ea25e2 100644
---- a/include/sys/abd_impl.h
-+++ b/include/sys/abd_impl.h
-@@ -21,6 +21,7 @@
- /*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
-+ * Copyright (c) 2023, 2024, Klara Inc.
- */
-
- #ifndef _ABD_IMPL_H
-@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
- ABDSTAT_DECR /* Decrease abdstat values */
- } abd_stats_op_t;
-
--struct scatterlist; /* forward declaration */
-+/* forward declarations */
-+struct scatterlist;
-+struct page;
-
- struct abd_iter {
- /* public interface */
-- void *iter_mapaddr; /* addr corresponding to iter_pos */
-- size_t iter_mapsize; /* length of data valid at mapaddr */
-+ union {
-+ /* for abd_iter_map()/abd_iter_unmap() */
-+ struct {
-+ /* addr corresponding to iter_pos */
-+ void *iter_mapaddr;
-+ /* length of data valid at mapaddr */
-+ size_t iter_mapsize;
-+ };
-+ /* for abd_iter_page() */
-+ struct {
-+ /* current page */
-+ struct page *iter_page;
-+ /* offset of data in page */
-+ size_t iter_page_doff;
-+ /* size of data in page */
-+ size_t iter_page_dsize;
-+ };
-+ };
-
- /* private */
- abd_t *iter_abd; /* ABD being iterated through */
-@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
- void abd_iter_advance(struct abd_iter *, size_t);
- void abd_iter_map(struct abd_iter *);
- void abd_iter_unmap(struct abd_iter *);
-+void abd_iter_page(struct abd_iter *);
-
- /*
- * Helper macros
-diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
-index 58a37df62..3b812271f 100644
---- a/module/os/freebsd/zfs/abd_os.c
-+++ b/module/os/freebsd/zfs/abd_os.c
-@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
- {
- ASSERT(!abd_is_gang(abd));
- abd_verify(abd);
-+ memset(aiter, 0, sizeof (struct abd_iter));
- aiter->iter_abd = abd;
-- aiter->iter_pos = 0;
-- aiter->iter_mapaddr = NULL;
-- aiter->iter_mapsize = 0;
- }
-
- /*
-diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
-index 24390fbbf..dae128012 100644
---- a/module/os/linux/zfs/abd_os.c
-+++ b/module/os/linux/zfs/abd_os.c
-@@ -21,6 +21,7 @@
- /*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2019 by Delphix. All rights reserved.
-+ * Copyright (c) 2023, 2024, Klara Inc.
- */
-
- /*
-@@ -59,6 +60,7 @@
- #include <sys/zfs_znode.h>
- #ifdef _KERNEL
- #include <linux/kmap_compat.h>
-+#include <linux/mm_compat.h>
- #include <linux/scatterlist.h>
- #endif
-
-@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
- {
- ASSERT(!abd_is_gang(abd));
- abd_verify(abd);
-+ memset(aiter, 0, sizeof (struct abd_iter));
- aiter->iter_abd = abd;
-- aiter->iter_mapaddr = NULL;
-- aiter->iter_mapsize = 0;
-- aiter->iter_pos = 0;
-- if (abd_is_linear(abd)) {
-- aiter->iter_offset = 0;
-- aiter->iter_sg = NULL;
-- } else {
-+ if (!abd_is_linear(abd)) {
- aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
- aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
- }
-@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
- boolean_t
- abd_iter_at_end(struct abd_iter *aiter)
- {
-+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
- return (aiter->iter_pos == aiter->iter_abd->abd_size);
- }
-
-@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
- void
- abd_iter_advance(struct abd_iter *aiter, size_t amount)
- {
-+ /*
-+ * Ensure that last chunk is not in use. abd_iterate_*() must clear
-+ * this state (directly or abd_iter_unmap()) before advancing.
-+ */
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-+ ASSERT3P(aiter->iter_page, ==, NULL);
-+ ASSERT0(aiter->iter_page_doff);
-+ ASSERT0(aiter->iter_page_dsize);
-
- /* There's nothing left to advance to, so do nothing */
- if (abd_iter_at_end(aiter))
-@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
- }
-
- #if defined(_KERNEL)
-+/*
-+ * Yield the next page struct and data offset and size within it, without
-+ * mapping it into the address space.
-+ */
-+void
-+abd_iter_page(struct abd_iter *aiter)
-+{
-+ if (abd_iter_at_end(aiter)) {
-+ aiter->iter_page = NULL;
-+ aiter->iter_page_doff = 0;
-+ aiter->iter_page_dsize = 0;
-+ return;
-+ }
-+
-+ struct page *page;
-+ size_t doff, dsize;
-+
-+ if (abd_is_linear(aiter->iter_abd)) {
-+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
-+
-+ /* memory address at iter_pos */
-+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
-+
-+ /* struct page for address */
-+ page = is_vmalloc_addr(paddr) ?
-+ vmalloc_to_page(paddr) : virt_to_page(paddr);
-+
-+ /* offset of address within the page */
-+ doff = offset_in_page(paddr);
-+
-+ /* total data remaining in abd from this position */
-+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
-+ } else {
-+ ASSERT(!abd_is_gang(aiter->iter_abd));
-+
-+ /* current scatter page */
-+ page = sg_page(aiter->iter_sg);
-+
-+ /* position within page */
-+ doff = aiter->iter_offset;
-+
-+ /* remaining data in scatterlist */
-+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
-+ aiter->iter_abd->abd_size - aiter->iter_pos);
-+ }
-+ ASSERT(page);
-+
-+ if (PageTail(page)) {
-+ /*
-+ * This page is part of a "compound page", which is a group of
-+ * pages that can be referenced from a single struct page *.
-+ * Its organised as a "head" page, followed by a series of
-+ * "tail" pages.
-+ *
-+ * In OpenZFS, compound pages are allocated using the
-+ * __GFP_COMP flag, which we get from scatter ABDs and SPL
-+ * vmalloc slabs (ie >16K allocations). So a great many of the
-+ * IO buffers we get are going to be of this type.
-+ *
-+ * The tail pages are just regular PAGE_SIZE pages, and can be
-+ * safely used as-is. However, the head page has length
-+ * covering itself and all the tail pages. If this ABD chunk
-+ * spans multiple pages, then we can use the head page and a
-+ * >PAGE_SIZE length, which is far more efficient.
-+ *
-+ * To do this, we need to adjust the offset to be counted from
-+ * the head page. struct page for compound pages are stored
-+ * contiguously, so we can just adjust by a simple offset.
-+ */
-+ struct page *head = compound_head(page);
-+ doff += ((page - head) * PAGESIZE);
-+ page = head;
-+ }
-+
-+ /* final page and position within it */
-+ aiter->iter_page = page;
-+ aiter->iter_page_doff = doff;
-+
-+ /* amount of data in the chunk, up to the end of the page */
-+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
-+}
-+
- /*
- * bio_nr_pages for ABD.
- * @off is the offset in @abd
-@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
- module_param(zfs_abd_scatter_max_order, uint, 0644);
- MODULE_PARM_DESC(zfs_abd_scatter_max_order,
- "Maximum order allocation used for a scatter ABD.");
--#endif
-+
-+#endif /* _KERNEL */
-diff --git a/module/zfs/abd.c b/module/zfs/abd.c
-index d982f201c..3388e2357 100644
---- a/module/zfs/abd.c
-+++ b/module/zfs/abd.c
-@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
- return (ret);
- }
-
-+#if defined(__linux__) && defined(_KERNEL)
-+int
-+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
-+ abd_iter_page_func_t *func, void *private)
-+{
-+ struct abd_iter aiter;
-+ int ret = 0;
-+
-+ if (size == 0)
-+ return (0);
-+
-+ abd_verify(abd);
-+ ASSERT3U(off + size, <=, abd->abd_size);
-+
-+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
-+
-+ while (size > 0) {
-+ IMPLY(abd_is_gang(abd), c_abd != NULL);
-+
-+ abd_iter_page(&aiter);
-+
-+ size_t len = MIN(aiter.iter_page_dsize, size);
-+ ASSERT3U(len, >, 0);
-+
-+ ret = func(aiter.iter_page, aiter.iter_page_doff,
-+ len, private);
-+
-+ aiter.iter_page = NULL;
-+ aiter.iter_page_doff = 0;
-+ aiter.iter_page_dsize = 0;
-+
-+ if (ret != 0)
-+ break;
-+
-+ size -= len;
-+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
-+ }
-+
-+ return (ret);
-+}
-+#endif
-+
- struct buf_arg {
- void *arg_buf;
- };
diff --git a/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch b/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
deleted file mode 100644
index ebabb1c8..00000000
--- a/debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
+++ /dev/null
@@ -1,349 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 9 Jan 2024 12:12:56 +1100
-Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_*
-
-This is just renaming the existing functions we're about to replace and
-grouping them together to make the next commits easier to follow.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4)
----
- include/sys/abd.h | 2 +
- module/os/linux/zfs/abd_os.c | 5 +
- module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
- 3 files changed, 120 insertions(+), 102 deletions(-)
-
-diff --git a/include/sys/abd.h b/include/sys/abd.h
-index 8a2df0bca..bee38b831 100644
---- a/include/sys/abd.h
-+++ b/include/sys/abd.h
-@@ -220,6 +220,8 @@ void abd_fini(void);
-
- /*
- * Linux ABD bio functions
-+ * Note: these are only needed to support vdev_classic. See comment in
-+ * vdev_disk.c.
- */
- #if defined(__linux__) && defined(_KERNEL)
- unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
-diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
-index dae128012..3fe01c0b7 100644
---- a/module/os/linux/zfs/abd_os.c
-+++ b/module/os/linux/zfs/abd_os.c
-@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
- aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
- }
-
-+/*
-+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in
-+ * vdev_disk.c.
-+ */
-+
- /*
- * bio_nr_pages for ABD.
- * @off is the offset in @abd
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index b0bda5fa2..957619b87 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
- */
- #define EFI_MIN_RESV_SIZE (16 * 1024)
-
--/*
-- * Virtual device vector for disks.
-- */
--typedef struct dio_request {
-- zio_t *dr_zio; /* Parent ZIO */
-- atomic_t dr_ref; /* References */
-- int dr_error; /* Bio error */
-- int dr_bio_count; /* Count of bio's */
-- struct bio *dr_bio[]; /* Attached bio's */
--} dio_request_t;
--
- /*
- * BIO request failfast mask.
- */
-@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
- v->vdev_tsd = NULL;
- }
-
--static dio_request_t *
--vdev_disk_dio_alloc(int bio_count)
--{
-- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
-- sizeof (struct bio *) * bio_count, KM_SLEEP);
-- atomic_set(&dr->dr_ref, 0);
-- dr->dr_bio_count = bio_count;
-- dr->dr_error = 0;
--
-- for (int i = 0; i < dr->dr_bio_count; i++)
-- dr->dr_bio[i] = NULL;
--
-- return (dr);
--}
--
--static void
--vdev_disk_dio_free(dio_request_t *dr)
--{
-- int i;
--
-- for (i = 0; i < dr->dr_bio_count; i++)
-- if (dr->dr_bio[i])
-- bio_put(dr->dr_bio[i]);
--
-- kmem_free(dr, sizeof (dio_request_t) +
-- sizeof (struct bio *) * dr->dr_bio_count);
--}
--
--static void
--vdev_disk_dio_get(dio_request_t *dr)
--{
-- atomic_inc(&dr->dr_ref);
--}
--
--static void
--vdev_disk_dio_put(dio_request_t *dr)
--{
-- int rc = atomic_dec_return(&dr->dr_ref);
--
-- /*
-- * Free the dio_request when the last reference is dropped and
-- * ensure zio_interpret is called only once with the correct zio
-- */
-- if (rc == 0) {
-- zio_t *zio = dr->dr_zio;
-- int error = dr->dr_error;
--
-- vdev_disk_dio_free(dr);
--
-- if (zio) {
-- zio->io_error = error;
-- ASSERT3S(zio->io_error, >=, 0);
-- if (zio->io_error)
-- vdev_disk_error(zio);
--
-- zio_delay_interrupt(zio);
-- }
-- }
--}
--
--BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
--{
-- dio_request_t *dr = bio->bi_private;
--
-- if (dr->dr_error == 0) {
--#ifdef HAVE_1ARG_BIO_END_IO_T
-- dr->dr_error = BIO_END_IO_ERROR(bio);
--#else
-- if (error)
-- dr->dr_error = -(error);
-- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-- dr->dr_error = EIO;
--#endif
-- }
--
-- /* Drop reference acquired by __vdev_disk_physio */
-- vdev_disk_dio_put(dr);
--}
--
- static inline void
- vdev_submit_bio_impl(struct bio *bio)
- {
-@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
- return (bio);
- }
-
-+/* ========== */
-+
-+/*
-+ * This is the classic, battle-tested BIO submission code.
-+ *
-+ * These functions have been renamed to vdev_classic_* to make it clear what
-+ * they belong to, but their implementations are unchanged.
-+ */
-+
-+/*
-+ * Virtual device vector for disks.
-+ */
-+typedef struct dio_request {
-+ zio_t *dr_zio; /* Parent ZIO */
-+ atomic_t dr_ref; /* References */
-+ int dr_error; /* Bio error */
-+ int dr_bio_count; /* Count of bio's */
-+ struct bio *dr_bio[]; /* Attached bio's */
-+} dio_request_t;
-+
-+static dio_request_t *
-+vdev_classic_dio_alloc(int bio_count)
-+{
-+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
-+ sizeof (struct bio *) * bio_count, KM_SLEEP);
-+ atomic_set(&dr->dr_ref, 0);
-+ dr->dr_bio_count = bio_count;
-+ dr->dr_error = 0;
-+
-+ for (int i = 0; i < dr->dr_bio_count; i++)
-+ dr->dr_bio[i] = NULL;
-+
-+ return (dr);
-+}
-+
-+static void
-+vdev_classic_dio_free(dio_request_t *dr)
-+{
-+ int i;
-+
-+ for (i = 0; i < dr->dr_bio_count; i++)
-+ if (dr->dr_bio[i])
-+ bio_put(dr->dr_bio[i]);
-+
-+ kmem_free(dr, sizeof (dio_request_t) +
-+ sizeof (struct bio *) * dr->dr_bio_count);
-+}
-+
-+static void
-+vdev_classic_dio_get(dio_request_t *dr)
-+{
-+ atomic_inc(&dr->dr_ref);
-+}
-+
-+static void
-+vdev_classic_dio_put(dio_request_t *dr)
-+{
-+ int rc = atomic_dec_return(&dr->dr_ref);
-+
-+ /*
-+ * Free the dio_request when the last reference is dropped and
-+ * ensure zio_interpret is called only once with the correct zio
-+ */
-+ if (rc == 0) {
-+ zio_t *zio = dr->dr_zio;
-+ int error = dr->dr_error;
-+
-+ vdev_classic_dio_free(dr);
-+
-+ if (zio) {
-+ zio->io_error = error;
-+ ASSERT3S(zio->io_error, >=, 0);
-+ if (zio->io_error)
-+ vdev_disk_error(zio);
-+
-+ zio_delay_interrupt(zio);
-+ }
-+ }
-+}
-+
-+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
-+{
-+ dio_request_t *dr = bio->bi_private;
-+
-+ if (dr->dr_error == 0) {
-+#ifdef HAVE_1ARG_BIO_END_IO_T
-+ dr->dr_error = BIO_END_IO_ERROR(bio);
-+#else
-+ if (error)
-+ dr->dr_error = -(error);
-+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-+ dr->dr_error = EIO;
-+#endif
-+ }
-+
-+ /* Drop reference acquired by vdev_classic_physio */
-+ vdev_classic_dio_put(dr);
-+}
-+
- static inline unsigned int
--vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
-+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
- {
- unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
- bio_size, abd_offset);
-@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
- }
-
- static int
--__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
-+vdev_classic_physio(struct block_device *bdev, zio_t *zio,
- size_t io_size, uint64_t io_offset, int rw, int flags)
- {
- dio_request_t *dr;
-@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
- }
-
- retry:
-- dr = vdev_disk_dio_alloc(bio_count);
-+ dr = vdev_classic_dio_alloc(bio_count);
-
- if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
- zio->io_vd->vdev_failfast == B_TRUE) {
-@@ -771,23 +780,23 @@ retry:
- * this should be rare - see the comment above.
- */
- if (dr->dr_bio_count == i) {
-- vdev_disk_dio_free(dr);
-+ vdev_classic_dio_free(dr);
- bio_count *= 2;
- goto retry;
- }
-
-- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
-+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
- dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
- if (unlikely(dr->dr_bio[i] == NULL)) {
-- vdev_disk_dio_free(dr);
-+ vdev_classic_dio_free(dr);
- return (SET_ERROR(ENOMEM));
- }
-
-- /* Matching put called by vdev_disk_physio_completion */
-- vdev_disk_dio_get(dr);
-+ /* Matching put called by vdev_classic_physio_completion */
-+ vdev_classic_dio_get(dr);
-
- BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
-- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
-+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
- dr->dr_bio[i]->bi_private = dr;
- bio_set_op_attrs(dr->dr_bio[i], rw, flags);
-
-@@ -801,7 +810,7 @@ retry:
- }
-
- /* Extra reference to protect dio_request during vdev_submit_bio */
-- vdev_disk_dio_get(dr);
-+ vdev_classic_dio_get(dr);
-
- if (dr->dr_bio_count > 1)
- blk_start_plug(&plug);
-@@ -815,11 +824,13 @@ retry:
- if (dr->dr_bio_count > 1)
- blk_finish_plug(&plug);
-
-- vdev_disk_dio_put(dr);
-+ vdev_classic_dio_put(dr);
-
- return (error);
- }
-
-+/* ========== */
-+
- BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
- {
- zio_t *zio = bio->bi_private;
-@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
- }
-
- zio->io_target_timestamp = zio_handle_io_delay(zio);
-- error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
-+ error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
- zio->io_size, zio->io_offset, rw, 0);
- rw_exit(&vd->vd_lock);
-
diff --git a/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch b/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch
deleted file mode 100644
index 23a946fc..00000000
--- a/debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch
+++ /dev/null
@@ -1,111 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 9 Jan 2024 12:23:30 +1100
-Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start
-
-Light reshuffle to make it a bit more linear to read and get rid of a
-bunch of args that aren't needed in all cases.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6)
----
- module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
- 1 file changed, 31 insertions(+), 20 deletions(-)
-
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index 957619b87..51e7cef2f 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
- }
-
- static int
--vdev_classic_physio(struct block_device *bdev, zio_t *zio,
-- size_t io_size, uint64_t io_offset, int rw, int flags)
-+vdev_classic_physio(zio_t *zio)
- {
-+ vdev_t *v = zio->io_vd;
-+ vdev_disk_t *vd = v->vdev_tsd;
-+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
-+ size_t io_size = zio->io_size;
-+ uint64_t io_offset = zio->io_offset;
-+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
-+ int flags = 0;
-+
- dio_request_t *dr;
- uint64_t abd_offset;
- uint64_t bio_offset;
-@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
- {
- vdev_t *v = zio->io_vd;
- vdev_disk_t *vd = v->vdev_tsd;
-- int rw, error;
-+ int error;
-
- /*
- * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
-@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
- rw_exit(&vd->vd_lock);
- zio_execute(zio);
- return;
-- case ZIO_TYPE_WRITE:
-- rw = WRITE;
-- break;
--
-- case ZIO_TYPE_READ:
-- rw = READ;
-- break;
-
- case ZIO_TYPE_TRIM:
- zio->io_error = vdev_disk_io_trim(zio);
-@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
- #endif
- return;
-
-- default:
-+ case ZIO_TYPE_READ:
-+ case ZIO_TYPE_WRITE:
-+ zio->io_target_timestamp = zio_handle_io_delay(zio);
-+ error = vdev_classic_physio(zio);
- rw_exit(&vd->vd_lock);
-- zio->io_error = SET_ERROR(ENOTSUP);
-- zio_interrupt(zio);
-+ if (error) {
-+ zio->io_error = error;
-+ zio_interrupt(zio);
-+ }
- return;
-- }
-
-- zio->io_target_timestamp = zio_handle_io_delay(zio);
-- error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
-- zio->io_size, zio->io_offset, rw, 0);
-- rw_exit(&vd->vd_lock);
-+ default:
-+ /*
-+ * Getting here means our parent vdev has made a very strange
-+ * request of us, and shouldn't happen. Assert here to force a
-+ * crash in dev builds, but in production return the IO
-+ * unhandled. The pool will likely suspend anyway but that's
-+ * nicer than crashing the kernel.
-+ */
-+ ASSERT3S(zio->io_type, ==, -1);
-
-- if (error) {
-- zio->io_error = error;
-+ rw_exit(&vd->vd_lock);
-+ zio->io_error = SET_ERROR(ENOTSUP);
- zio_interrupt(zio);
- return;
- }
-+
-+ __builtin_unreachable();
- }
-
- static void
diff --git a/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch b/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch
deleted file mode 100644
index a169979c..00000000
--- a/debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 9 Jan 2024 12:29:19 +1100
-Subject: [PATCH] vdev_disk: make read/write IO function configurable
-
-This is just setting up for the next couple of commits, which will add a
-new IO function and a parameter to select it.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0)
----
- module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
- 1 file changed, 21 insertions(+), 2 deletions(-)
-
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index 51e7cef2f..de4dba72f 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
- #endif
- }
-
-+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
-+
- static void
- vdev_disk_io_start(zio_t *zio)
- {
-@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
- case ZIO_TYPE_READ:
- case ZIO_TYPE_WRITE:
- zio->io_target_timestamp = zio_handle_io_delay(zio);
-- error = vdev_classic_physio(zio);
-+ error = vdev_disk_io_rw_fn(zio);
- rw_exit(&vd->vd_lock);
- if (error) {
- zio->io_error = error;
-@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
- /* XXX: Implement me as a vnode rele for the device */
- }
-
-+/*
-+ * At first vdev use, set the submission function from the default value if
-+ * it hasn't been set already.
-+ */
-+static int
-+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
-+{
-+ (void) spa;
-+ (void) nv;
-+ (void) tsd;
-+
-+ if (vdev_disk_io_rw_fn == NULL)
-+ vdev_disk_io_rw_fn = vdev_classic_physio;
-+
-+ return (0);
-+}
-+
- vdev_ops_t vdev_disk_ops = {
-- .vdev_op_init = NULL,
-+ .vdev_op_init = vdev_disk_init,
- .vdev_op_fini = NULL,
- .vdev_op_open = vdev_disk_open,
- .vdev_op_close = vdev_disk_close,
diff --git a/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch b/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
deleted file mode 100644
index 8ccbf655..00000000
--- a/debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
+++ /dev/null
@@ -1,671 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 18 Jul 2023 11:11:29 +1000
-Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages
-
-This commit tackles a number of issues in the way BIOs (`struct bio`)
-are constructed for submission to the Linux block layer.
-
-The kernel has a hard upper limit on the number of pages/segments that
-can be added to a BIO, as well as a separate limit for each device
-(related to its queue depth and other scheduling characteristics).
-
-ZFS counts the number of memory pages in the request ABD
-(`abd_nr_pages_off()`), and then uses that as the number of segments to
-put into the BIO, up to the hard upper limit. If it requires more than
-the limit, it will create multiple BIOs.
-
-Leaving aside the fact that page count method is wrong (see below), not
-limiting to the device segment max means that the device driver will
-need to split the BIO in half. This alone is not necessarily a
-problem, but it interacts with another issue to cause a much larger
-problem.
-
-The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
-`struct page` pointer, and offset+len within it. `struct page` can
-represent a run of contiguous memory pages (known as a "compound page").
-It can be of arbitrary length.
-
-The ZFS functions that count ABD pages and load them into the BIO
-(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
-consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
-page` is for multiple pages. In this case, it will load the same `struct
-page` into the BIO multiple times, with the offset adjusted each time.
-
-With a sufficiently large ABD, this can easily lead to the BIO being
-entirely filled much earlier than it could have been. This also
-further contributes to the problem caused by the incorrect segment limit
-calculation, as it's much easier to go past the device limit, and so
-require a split.
-
-Again, this is not a problem on its own.
-
-The logic for "never submit more than `PAGE_SIZE`" is actually a little
-more subtle. It will actually never submit a buffer that crosses a 4K
-page boundary.
-
-In practice, this is fine, as most ABDs are scattered, that is a list of
-complete 4K pages, and so are loaded in as such.
-
-Linear ABDs are typically allocated from slabs, and for small sizes they
-are frequently not aligned to page boundaries. For example, a 12K
-allocation can span four pages, eg:
-
- -- 4K -- -- 4K -- -- 4K -- -- 4K --
- | | | | |
- :## ######## ######## ######: [1K, 4K, 4K, 3K]
-
-Such an allocation would be loaded into a BIO as you see:
-
- [1K, 4K, 4K, 3K]
-
-This tends not to be a problem in practice, because even if the BIO were
-filled and needed to be split, each half would still have either a start
-or end aligned to the logical block size of the device (assuming 4K at
-least).
-
----
-
-In ideal circumstances, these shortcomings don't cause any particular
-problems. It's when they start to interact with other ZFS features that
-things get interesting.
-
-Aggregation will create a "gang" ABD, which is simply a list of other
-ABDs. Iterating over a gang ABD is just iterating over each ABD within
-it in turn.
-
-Because the segments are simply loaded in order, we can end up with
-uneven segments either side of the "gap" between the two ABDs. For
-example, two 12K ABDs might be aggregated and then loaded as:
-
- [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]
-
-Should a split occur, each individual BIO can end up either having a
-start or end offset that is not aligned to the logical block size, which
-some drivers (eg SCSI) will reject. However, this tends not to happen
-because the default aggregation limit usually keeps the BIO small enough
-to not require more than one split, and most pages are actually full 4K
-pages, so hitting an uneven gap is very rare anyway.
-
-If the pool is under particular memory pressure, then an IO can be
-broken down into a "gang block", a 512-byte block composed of a header
-and up to three block pointers. Each points to a fragment of the
-original write, or in turn, another gang block, breaking the original
-data up over and over until space can be found in the pool for each of
-them.
-
-Each gang header is a separate 512-byte memory allocation from a slab,
-that needs to be written down to disk. When the gang header is added to
-the BIO, it's a single 512-byte segment.
-
-Pulling all this together, consider a large aggregated write of gang
-blocks. This results in a BIO containing lots of 512-byte segments. Given
-our tendency to overfill the BIO, a split is likely, and most possible
-split points will yield a pair of BIOs that are misaligned. Drivers that
-care, like the SCSI driver, will reject them.
-
----
-
-This commit is a substantial refactor and rewrite of much of `vdev_disk`
-to sort all this out.
-
-`vdev_bio_max_segs()` now returns the ideal maximum size for the device,
-if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
-override this, to assist with testing.
-
-We scan the ABD up front to count the number of pages within it, and to
-confirm that if we submitted all those pages to one or more BIOs, it
-could be split at any point without creating a misaligned BIO. If the
-pages in the BIO are not usable (as in any of the above situations), the
-ABD is linearised, and then checked again. This is the same technique
-used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
-and allocator quirks.
-
-`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
-idea is simply that it can hold all the state needed to create, submit
-and return multiple BIOs, including all the refcounts, the ABD copy if
-it was needed, and so on. Apart from what I hope is a clearer interface,
-the major difference is that because we know how many BIOs we'll need up
-front, we don't need the old overflow logic that would grow the BIO
-array, throw away all the old work and restart. We can get it right from
-the start.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
----
- include/os/linux/kernel/linux/mod_compat.h | 1 +
- man/man4/zfs.4 | 10 +-
- module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++-
- 3 files changed, 447 insertions(+), 3 deletions(-)
-
-diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
-index 8e20a9613..039865b70 100644
---- a/include/os/linux/kernel/linux/mod_compat.h
-+++ b/include/os/linux/kernel/linux/mod_compat.h
-@@ -68,6 +68,7 @@ enum scope_prefix_types {
- zfs_trim,
- zfs_txg,
- zfs_vdev,
-+ zfs_vdev_disk,
- zfs_vdev_file,
- zfs_vdev_mirror,
- zfs_vnops,
-diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
-index 352990e02..b5679f2f0 100644
---- a/man/man4/zfs.4
-+++ b/man/man4/zfs.4
-@@ -2,6 +2,7 @@
- .\" Copyright (c) 2013 by Turbo Fredriksson <turbo at bayour.com>. All rights reserved.
- .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
- .\" Copyright (c) 2019 Datto Inc.
-+.\" Copyright (c) 2023, 2024 Klara, Inc.
- .\" The contents of this file are subject to the terms of the Common Development
- .\" and Distribution License (the "License"). You may not use this file except
- .\" in compliance with the License. You can obtain a copy of the license at
-@@ -15,7 +16,7 @@
- .\" own identifying information:
- .\" Portions Copyright [yyyy] [name of copyright owner]
- .\"
--.Dd July 21, 2023
-+.Dd January 9, 2024
- .Dt ZFS 4
- .Os
- .
-@@ -1345,6 +1346,13 @@ _
- 4 Driver No driver retries on driver errors.
- .TE
- .
-+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
-+Maximum number of segments to add to a BIO (min 4).
-+If this is higher than the maximum allowed by the device queue or the kernel
-+itself, it will be clamped.
-+Setting it to zero will cause the kernel's ideal size to be used.
-+This parameter only applies on Linux.
-+.
- .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
- Time before expiring
- .Pa .zfs/snapshot .
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index de4dba72f..0ccb9ad96 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -24,6 +24,7 @@
- * Rewritten for Linux by Brian Behlendorf <behlendorf1 at llnl.gov>.
- * LLNL-CODE-403049.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
-+ * Copyright (c) 2023, 2024, Klara Inc.
- */
-
- #include <sys/zfs_context.h>
-@@ -66,6 +67,13 @@ typedef struct vdev_disk {
- krwlock_t vd_lock;
- } vdev_disk_t;
-
-+/*
-+ * Maximum number of segments to add to a bio (min 4). If this is higher than
-+ * the maximum allowed by the device queue or the kernel itself, it will be
-+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
-+ */
-+uint_t zfs_vdev_disk_max_segs = 0;
-+
- /*
- * Unique identifier for the exclusive vdev holder.
- */
-@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
- return (bio);
- }
-
-+static inline uint_t
-+vdev_bio_max_segs(struct block_device *bdev)
-+{
-+ /*
-+ * Smallest of the device max segs and the tuneable max segs. Minimum
-+ * 4, so there's room to finish split pages if they come up.
-+ */
-+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
-+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
-+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
-+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
-+
-+#ifdef HAVE_BIO_MAX_SEGS
-+ return (bio_max_segs(max_segs));
-+#else
-+ return (MIN(max_segs, BIO_MAX_PAGES));
-+#endif
-+}
-+
-+static inline uint_t
-+vdev_bio_max_bytes(struct block_device *bdev)
-+{
-+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
-+}
-+
-+
-+/*
-+ * Virtual block IO object (VBIO)
-+ *
-+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
-+ * they can hold. Depending on how they're allocated and structured, a large
-+ * ZIO can require more than one BIO to be submitted to the kernel, which then
-+ * all have to complete before we can return the completed ZIO back to ZFS.
-+ *
-+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
-+ * translate a ZIO down into the kernel block layer and back again.
-+ *
-+ * Note that these are only used for data ZIOs (read/write). Meta-operations
-+ * (flush/trim) don't need multiple BIOs and so can just make the call
-+ * directly.
-+ */
-+typedef struct {
-+ zio_t *vbio_zio; /* parent zio */
-+
-+ struct block_device *vbio_bdev; /* blockdev to submit bios to */
-+
-+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */
-+
-+ atomic_t vbio_ref; /* bio refcount */
-+ int vbio_error; /* error from failed bio */
-+
-+ uint_t vbio_max_segs; /* max segs per bio */
-+
-+ uint_t vbio_max_bytes; /* max bytes per bio */
-+ uint_t vbio_lbs_mask; /* logical block size mask */
-+
-+ uint64_t vbio_offset; /* start offset of next bio */
-+
-+ struct bio *vbio_bio; /* pointer to the current bio */
-+ struct bio *vbio_bios; /* list of all bios */
-+} vbio_t;
-+
-+static vbio_t *
-+vbio_alloc(zio_t *zio, struct block_device *bdev)
-+{
-+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
-+
-+ vbio->vbio_zio = zio;
-+ vbio->vbio_bdev = bdev;
-+ atomic_set(&vbio->vbio_ref, 0);
-+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
-+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
-+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
-+ vbio->vbio_offset = zio->io_offset;
-+
-+ return (vbio);
-+}
-+
-+static int
-+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
-+{
-+ struct bio *bio;
-+ uint_t ssize;
-+
-+ while (size > 0) {
-+ bio = vbio->vbio_bio;
-+ if (bio == NULL) {
-+ /* New BIO, allocate and set up */
-+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
-+ vbio->vbio_max_segs);
-+ if (unlikely(bio == NULL))
-+ return (SET_ERROR(ENOMEM));
-+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
-+
-+ bio->bi_next = vbio->vbio_bios;
-+ vbio->vbio_bios = vbio->vbio_bio = bio;
-+ }
-+
-+ /*
-+ * Only load as much of the current page data as will fit in
-+ * the space left in the BIO, respecting lbs alignment. Older
-+ * kernels will error if we try to overfill the BIO, while
-+ * newer ones will accept it and split the BIO. This ensures
-+ * everything works on older kernels, and avoids an additional
-+ * overhead on the new.
-+ */
-+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
-+ vbio->vbio_lbs_mask);
-+ if (ssize > 0 &&
-+ bio_add_page(bio, page, ssize, offset) == ssize) {
-+ /* Accepted, adjust and load any remaining. */
-+ size -= ssize;
-+ offset += ssize;
-+ continue;
-+ }
-+
-+ /* No room, set up for a new BIO and loop */
-+ vbio->vbio_offset += BIO_BI_SIZE(bio);
-+
-+ /* Signal new BIO allocation wanted */
-+ vbio->vbio_bio = NULL;
-+ }
-+
-+ return (0);
-+}
-+
-+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
-+static void vbio_put(vbio_t *vbio);
-+
-+static void
-+vbio_submit(vbio_t *vbio, int flags)
-+{
-+ ASSERT(vbio->vbio_bios);
-+ struct bio *bio = vbio->vbio_bios;
-+ vbio->vbio_bio = vbio->vbio_bios = NULL;
-+
-+ /*
-+ * We take a reference for each BIO as we submit it, plus one to
-+ * protect us from BIOs completing before we're done submitting them
-+ * all, causing vbio_put() to free vbio out from under us and/or the
-+ * zio to be returned before all its IO has completed.
-+ */
-+ atomic_set(&vbio->vbio_ref, 1);
-+
-+ /*
-+ * If we're submitting more than one BIO, inform the block layer so
-+ * it can batch them if it wants.
-+ */
-+ struct blk_plug plug;
-+ boolean_t do_plug = (bio->bi_next != NULL);
-+ if (do_plug)
-+ blk_start_plug(&plug);
-+
-+ /* Submit all the BIOs */
-+ while (bio != NULL) {
-+ atomic_inc(&vbio->vbio_ref);
-+
-+ struct bio *next = bio->bi_next;
-+ bio->bi_next = NULL;
-+
-+ bio->bi_end_io = vdev_disk_io_rw_completion;
-+ bio->bi_private = vbio;
-+ bio_set_op_attrs(bio,
-+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
-+ WRITE : READ, flags);
-+
-+ vdev_submit_bio(bio);
-+
-+ bio = next;
-+ }
-+
-+ /* Finish the batch */
-+ if (do_plug)
-+ blk_finish_plug(&plug);
-+
-+ /* Release the extra reference */
-+ vbio_put(vbio);
-+}
-+
-+static void
-+vbio_return_abd(vbio_t *vbio)
-+{
-+ zio_t *zio = vbio->vbio_zio;
-+ if (vbio->vbio_abd == NULL)
-+ return;
-+
-+ /*
-+ * If we copied the ABD before issuing it, clean up and return the copy
-+ * to the ABD, with changes if appropriate.
-+ */
-+ void *buf = abd_to_buf(vbio->vbio_abd);
-+ abd_free(vbio->vbio_abd);
-+ vbio->vbio_abd = NULL;
-+
-+ if (zio->io_type == ZIO_TYPE_READ)
-+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
-+ else
-+ abd_return_buf(zio->io_abd, buf, zio->io_size);
-+}
-+
-+static void
-+vbio_free(vbio_t *vbio)
-+{
-+ VERIFY0(atomic_read(&vbio->vbio_ref));
-+
-+ vbio_return_abd(vbio);
-+
-+ kmem_free(vbio, sizeof (vbio_t));
-+}
-+
-+static void
-+vbio_put(vbio_t *vbio)
-+{
-+ if (atomic_dec_return(&vbio->vbio_ref) > 0)
-+ return;
-+
-+ /*
-+ * This was the last reference, so the entire IO is completed. Clean
-+ * up and submit it for processing.
-+ */
-+
-+ /*
-+ * Get any data buf back to the original ABD, if necessary. We do this
-+ * now so we can get the ZIO into the pipeline as quickly as possible,
-+ * and then do the remaining cleanup after.
-+ */
-+ vbio_return_abd(vbio);
-+
-+ zio_t *zio = vbio->vbio_zio;
-+
-+ /*
-+ * Set the overall error. If multiple BIOs returned an error, only the
-+ * first will be taken; the others are dropped (see
-+ * vdev_disk_io_rw_completion()). It's pretty much impossible for
-+ * multiple IOs to the same device to fail with different errors, so
-+ * there's no real risk.
-+ */
-+ zio->io_error = vbio->vbio_error;
-+ if (zio->io_error)
-+ vdev_disk_error(zio);
-+
-+ /* All done, submit for processing */
-+ zio_delay_interrupt(zio);
-+
-+ /* Finish cleanup */
-+ vbio_free(vbio);
-+}
-+
-+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
-+{
-+ vbio_t *vbio = bio->bi_private;
-+
-+ if (vbio->vbio_error == 0) {
-+#ifdef HAVE_1ARG_BIO_END_IO_T
-+ vbio->vbio_error = BIO_END_IO_ERROR(bio);
-+#else
-+ if (error)
-+ vbio->vbio_error = -(error);
-+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-+ vbio->vbio_error = EIO;
-+#endif
-+ }
-+
-+ /*
-+ * Destroy the BIO. This is safe to do; the vbio owns its data and the
-+ * kernel won't touch it again after the completion function runs.
-+ */
-+ bio_put(bio);
-+
-+ /* Drop this BIOs reference acquired by vbio_submit() */
-+ vbio_put(vbio);
-+}
-+
-+/*
-+ * Iterator callback to count ABD pages and check their size & alignment.
-+ *
-+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
-+ * the data within that page. A page can be arbitrarily large ("compound"
-+ * pages) but we still have to ensure the data portion is correctly sized and
-+ * aligned to the logical block size, to ensure that if the kernel wants to
-+ * split the BIO, the two halves will still be properly aligned.
-+ */
-+typedef struct {
-+ uint_t bmask;
-+ uint_t npages;
-+ uint_t end;
-+} vdev_disk_check_pages_t;
-+
-+static int
-+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
-+{
-+ vdev_disk_check_pages_t *s = priv;
-+
-+ /*
-+ * If we didn't finish on a block size boundary last time, then there
-+ * would be a gap if we tried to use this ABD as-is, so abort.
-+ */
-+ if (s->end != 0)
-+ return (1);
-+
-+ /*
-+ * Note if we're taking less than a full block, so we can check it
-+ * above on the next call.
-+ */
-+ s->end = len & s->bmask;
-+
-+ /* All blocks after the first must start on a block size boundary. */
-+ if (s->npages != 0 && (off & s->bmask) != 0)
-+ return (1);
-+
-+ s->npages++;
-+ return (0);
-+}
-+
-+/*
-+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns
-+ * B_TRUE if it can, or B_FALSE if it can't be submitted like this.
-+ */
-+static boolean_t
-+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
-+{
-+ vdev_disk_check_pages_t s = {
-+ .bmask = bdev_logical_block_size(bdev)-1,
-+ .npages = 0,
-+ .end = 0,
-+ };
-+
-+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
-+ return (B_FALSE);
-+
-+ return (B_TRUE);
-+}
-+
-+/* Iterator callback to submit ABD pages to the vbio. */
-+static int
-+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
-+{
-+ vbio_t *vbio = priv;
-+ return (vbio_add_page(vbio, page, len, off));
-+}
-+
-+static int
-+vdev_disk_io_rw(zio_t *zio)
-+{
-+ vdev_t *v = zio->io_vd;
-+ vdev_disk_t *vd = v->vdev_tsd;
-+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
-+ int flags = 0;
-+
-+ /*
-+ * Accessing outside the block device is never allowed.
-+ */
-+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
-+ vdev_dbgmsg(zio->io_vd,
-+ "Illegal access %llu size %llu, device size %llu",
-+ (u_longlong_t)zio->io_offset,
-+ (u_longlong_t)zio->io_size,
-+ (u_longlong_t)i_size_read(bdev->bd_inode));
-+ return (SET_ERROR(EIO));
-+ }
-+
-+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
-+ v->vdev_failfast == B_TRUE) {
-+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
-+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
-+ }
-+
-+ /*
-+ * Check alignment of the incoming ABD. If any part of it would require
-+ * submitting a page that is not aligned to the logical block size,
-+ * then we take a copy into a linear buffer and submit that instead.
-+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
-+ * usually requiring abnormally-small data blocks (eg gang blocks)
-+ * mixed into the same ABD as larger ones (eg aggregated).
-+ */
-+ abd_t *abd = zio->io_abd;
-+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
-+ void *buf;
-+ if (zio->io_type == ZIO_TYPE_READ)
-+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
-+ else
-+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
-+
-+ /*
-+ * Wrap the copy in an abd_t, so we can use the same iterators
-+ * to count and fill the vbio later.
-+ */
-+ abd = abd_get_from_buf(buf, zio->io_size);
-+
-+ /*
-+ * False here would mean the borrowed copy has an invalid
-+ * alignment too, which would mean we've somehow been passed a
-+ * linear ABD with an interior page that has a non-zero offset
-+ * or a size not a multiple of PAGE_SIZE. This is not possible.
-+ * It would mean either zio_buf_alloc() or its underlying
-+ * allocators have done something extremely strange, or our
-+ * math in vdev_disk_check_pages() is wrong. In either case,
-+ * something is seriously wrong and it's not safe to continue.
-+ */
-+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
-+ }
-+
-+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
-+ int error = 0;
-+ vbio_t *vbio = vbio_alloc(zio, bdev);
-+ if (abd != zio->io_abd)
-+ vbio->vbio_abd = abd;
-+
-+ /* Fill it with pages */
-+ error = abd_iterate_page_func(abd, 0, zio->io_size,
-+ vdev_disk_fill_vbio_cb, vbio);
-+ if (error != 0) {
-+ vbio_free(vbio);
-+ return (error);
-+ }
-+
-+ vbio_submit(vbio, flags);
-+ return (0);
-+}
-+
- /* ========== */
-
- /*
-- * This is the classic, battle-tested BIO submission code.
-+ * This is the classic, battle-tested BIO submission code. Until we're totally
-+ * sure that the new code is safe and correct in all cases, this will remain
-+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
-+ * load time.
- *
- * These functions have been renamed to vdev_classic_* to make it clear what
- * they belong to, but their implementations are unchanged.
-@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
- (void) tsd;
-
- if (vdev_disk_io_rw_fn == NULL)
-- vdev_disk_io_rw_fn = vdev_classic_physio;
-+ /* XXX make configurable */
-+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
-
- return (0);
- }
-@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
-
- ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
- "Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
-+
-+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
-+ "Maximum number of data segments to add to an IO request (min 4)");
diff --git a/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch b/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
deleted file mode 100644
index b7aef38e..00000000
--- a/debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
+++ /dev/null
@@ -1,104 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 9 Jan 2024 13:28:57 +1100
-Subject: [PATCH] vdev_disk: add module parameter to select BIO submission
- method
-
-This makes the submission method selectable at module load time via the
-`zfs_vdev_disk_classic` parameter, allowing this change to be backported
-to 2.2 safely, and disabled in favour of the "classic" submission method
-if new problems come up.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
----
- man/man4/zfs.4 | 16 ++++++++++++++++
- module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
- 2 files changed, 45 insertions(+), 2 deletions(-)
-
-diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
-index b5679f2f0..6a628e7f3 100644
---- a/man/man4/zfs.4
-+++ b/man/man4/zfs.4
-@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
- itself, it will be clamped.
- Setting it to zero will cause the kernel's ideal size to be used.
- This parameter only applies on Linux.
-+This parameter is ignored if
-+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
-+.
-+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
-+and earlier.
-+This "classic" method has known issues with highly fragmented IO requests and
-+is slower on many workloads, but it has been in use for many years and is known
-+to be very stable.
-+If you set this parameter, please also open a bug report why you did so,
-+including the workload involved and any error messages.
-+.Pp
-+This parameter and the classic submission method will be removed once we have
-+total confidence in the new method.
-+.Pp
-+This parameter only applies on Linux, and can only be set at module load time.
- .
- .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
- Time before expiring
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index 0ccb9ad96..a9110623a 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
- /* XXX: Implement me as a vnode rele for the device */
- }
-
-+/*
-+ * BIO submission method. See comment above about vdev_classic.
-+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
-+ */
-+static uint_t zfs_vdev_disk_classic = 0; /* default new */
-+
-+/* Set submission function from module parameter */
-+static int
-+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
-+{
-+ int err = param_set_uint(buf, kp);
-+ if (err < 0)
-+ return (SET_ERROR(err));
-+
-+ vdev_disk_io_rw_fn =
-+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
-+
-+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
-+ zfs_vdev_disk_classic ? "classic" : "new");
-+
-+ return (0);
-+}
-+
- /*
- * At first vdev use, set the submission function from the default value if
- * it hasn't been set already.
-@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
- (void) tsd;
-
- if (vdev_disk_io_rw_fn == NULL)
-- /* XXX make configurable */
-- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
-+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
-+ vdev_classic_physio : vdev_disk_io_rw;
-
- return (0);
- }
-@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
-
- ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
- "Maximum number of data segments to add to an IO request (min 4)");
-+
-+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
-+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
-+ "Use classic BIO submission method");
diff --git a/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch b/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
deleted file mode 100644
index 2dbf8916..00000000
--- a/debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
+++ /dev/null
@@ -1,363 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Wed, 21 Feb 2024 11:07:21 +1100
-Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs
-
-Simplifies our code a lot, so we don't have to wait for each and
-reassemble them.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3)
----
- module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
- 1 file changed, 80 insertions(+), 151 deletions(-)
-
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index a9110623a..36468fc21 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
- if (v->vdev_reopening || vd == NULL)
- return;
-
-- if (vd->vd_bdh != NULL) {
-+ if (vd->vd_bdh != NULL)
- vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
- zfs_vdev_holder);
-- }
-
- rw_destroy(&vd->vd_lock);
- kmem_free(vd, sizeof (vdev_disk_t));
-@@ -663,9 +662,6 @@ typedef struct {
-
- abd_t *vbio_abd; /* abd carrying borrowed linear buf */
-
-- atomic_t vbio_ref; /* bio refcount */
-- int vbio_error; /* error from failed bio */
--
- uint_t vbio_max_segs; /* max segs per bio */
-
- uint_t vbio_max_bytes; /* max bytes per bio */
-@@ -674,43 +670,52 @@ typedef struct {
- uint64_t vbio_offset; /* start offset of next bio */
-
- struct bio *vbio_bio; /* pointer to the current bio */
-- struct bio *vbio_bios; /* list of all bios */
-+ int vbio_flags; /* bio flags */
- } vbio_t;
-
- static vbio_t *
--vbio_alloc(zio_t *zio, struct block_device *bdev)
-+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
- {
- vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
-
- vbio->vbio_zio = zio;
- vbio->vbio_bdev = bdev;
-- atomic_set(&vbio->vbio_ref, 0);
-+ vbio->vbio_abd = NULL;
- vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
- vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
- vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
- vbio->vbio_offset = zio->io_offset;
-+ vbio->vbio_bio = NULL;
-+ vbio->vbio_flags = flags;
-
- return (vbio);
- }
-
-+BIO_END_IO_PROTO(vbio_completion, bio, error);
-+
- static int
- vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
- {
-- struct bio *bio;
-+ struct bio *bio = vbio->vbio_bio;
- uint_t ssize;
-
- while (size > 0) {
-- bio = vbio->vbio_bio;
- if (bio == NULL) {
- /* New BIO, allocate and set up */
- bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
- vbio->vbio_max_segs);
-- if (unlikely(bio == NULL))
-- return (SET_ERROR(ENOMEM));
-+ VERIFY(bio);
-+
- BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
-+ bio_set_op_attrs(bio,
-+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
-+ WRITE : READ, vbio->vbio_flags);
-
-- bio->bi_next = vbio->vbio_bios;
-- vbio->vbio_bios = vbio->vbio_bio = bio;
-+ if (vbio->vbio_bio) {
-+ bio_chain(vbio->vbio_bio, bio);
-+ vdev_submit_bio(vbio->vbio_bio);
-+ }
-+ vbio->vbio_bio = bio;
- }
-
- /*
-@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
- vbio->vbio_offset += BIO_BI_SIZE(bio);
-
- /* Signal new BIO allocation wanted */
-- vbio->vbio_bio = NULL;
-+ bio = NULL;
- }
-
- return (0);
- }
-
--BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
--static void vbio_put(vbio_t *vbio);
-+/* Iterator callback to submit ABD pages to the vbio. */
-+static int
-+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
-+{
-+ vbio_t *vbio = priv;
-+ return (vbio_add_page(vbio, page, len, off));
-+}
-
-+/* Create some BIOs, fill them with data and submit them */
- static void
--vbio_submit(vbio_t *vbio, int flags)
-+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
- {
-- ASSERT(vbio->vbio_bios);
-- struct bio *bio = vbio->vbio_bios;
-- vbio->vbio_bio = vbio->vbio_bios = NULL;
--
-- /*
-- * We take a reference for each BIO as we submit it, plus one to
-- * protect us from BIOs completing before we're done submitting them
-- * all, causing vbio_put() to free vbio out from under us and/or the
-- * zio to be returned before all its IO has completed.
-- */
-- atomic_set(&vbio->vbio_ref, 1);
-+ ASSERT(vbio->vbio_bdev);
-
- /*
-- * If we're submitting more than one BIO, inform the block layer so
-- * it can batch them if it wants.
-+ * We plug so we can submit the BIOs as we go and only unplug them when
-+ * they are fully created and submitted. This is important; if we don't
-+ * plug, then the kernel may start executing earlier BIOs while we're
-+ * still creating and executing later ones, and if the device goes
-+ * away while that's happening, older kernels can get confused and
-+ * trample memory.
- */
- struct blk_plug plug;
-- boolean_t do_plug = (bio->bi_next != NULL);
-- if (do_plug)
-- blk_start_plug(&plug);
-+ blk_start_plug(&plug);
-
-- /* Submit all the BIOs */
-- while (bio != NULL) {
-- atomic_inc(&vbio->vbio_ref);
-+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
-+ ASSERT(vbio->vbio_bio);
-
-- struct bio *next = bio->bi_next;
-- bio->bi_next = NULL;
-+ vbio->vbio_bio->bi_end_io = vbio_completion;
-+ vbio->vbio_bio->bi_private = vbio;
-
-- bio->bi_end_io = vdev_disk_io_rw_completion;
-- bio->bi_private = vbio;
-- bio_set_op_attrs(bio,
-- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
-- WRITE : READ, flags);
-+ vdev_submit_bio(vbio->vbio_bio);
-
-- vdev_submit_bio(bio);
--
-- bio = next;
-- }
--
-- /* Finish the batch */
-- if (do_plug)
-- blk_finish_plug(&plug);
-+ blk_finish_plug(&plug);
-
-- /* Release the extra reference */
-- vbio_put(vbio);
-+ vbio->vbio_bio = NULL;
-+ vbio->vbio_bdev = NULL;
- }
-
--static void
--vbio_return_abd(vbio_t *vbio)
-+/* IO completion callback */
-+BIO_END_IO_PROTO(vbio_completion, bio, error)
- {
-+ vbio_t *vbio = bio->bi_private;
- zio_t *zio = vbio->vbio_zio;
-- if (vbio->vbio_abd == NULL)
-- return;
--
-- /*
-- * If we copied the ABD before issuing it, clean up and return the copy
-- * to the ABD, with changes if appropriate.
-- */
-- void *buf = abd_to_buf(vbio->vbio_abd);
-- abd_free(vbio->vbio_abd);
-- vbio->vbio_abd = NULL;
--
-- if (zio->io_type == ZIO_TYPE_READ)
-- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
-- else
-- abd_return_buf(zio->io_abd, buf, zio->io_size);
--}
-
--static void
--vbio_free(vbio_t *vbio)
--{
-- VERIFY0(atomic_read(&vbio->vbio_ref));
--
-- vbio_return_abd(vbio);
-+ ASSERT(zio);
-
-- kmem_free(vbio, sizeof (vbio_t));
--}
-+ /* Capture and log any errors */
-+#ifdef HAVE_1ARG_BIO_END_IO_T
-+ zio->io_error = BIO_END_IO_ERROR(bio);
-+#else
-+ zio->io_error = 0;
-+ if (error)
-+ zio->io_error = -(error);
-+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-+ zio->io_error = EIO;
-+#endif
-+ ASSERT3U(zio->io_error, >=, 0);
-
--static void
--vbio_put(vbio_t *vbio)
--{
-- if (atomic_dec_return(&vbio->vbio_ref) > 0)
-- return;
-+ if (zio->io_error)
-+ vdev_disk_error(zio);
-
-- /*
-- * This was the last reference, so the entire IO is completed. Clean
-- * up and submit it for processing.
-- */
-+ /* Return the BIO to the kernel */
-+ bio_put(bio);
-
- /*
-- * Get any data buf back to the original ABD, if necessary. We do this
-- * now so we can get the ZIO into the pipeline as quickly as possible,
-- * and then do the remaining cleanup after.
-+ * If we copied the ABD before issuing it, clean up and return the copy
-+ * to the ABD, with changes if appropriate.
- */
-- vbio_return_abd(vbio);
-+ if (vbio->vbio_abd != NULL) {
-+ void *buf = abd_to_buf(vbio->vbio_abd);
-+ abd_free(vbio->vbio_abd);
-+ vbio->vbio_abd = NULL;
-
-- zio_t *zio = vbio->vbio_zio;
-+ if (zio->io_type == ZIO_TYPE_READ)
-+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
-+ else
-+ abd_return_buf(zio->io_abd, buf, zio->io_size);
-+ }
-
-- /*
-- * Set the overall error. If multiple BIOs returned an error, only the
-- * first will be taken; the others are dropped (see
-- * vdev_disk_io_rw_completion()). It's pretty much impossible for
-- * multiple IOs to the same device to fail with different errors, so
-- * there's no real risk.
-- */
-- zio->io_error = vbio->vbio_error;
-- if (zio->io_error)
-- vdev_disk_error(zio);
-+ /* Final cleanup */
-+ kmem_free(vbio, sizeof (vbio_t));
-
- /* All done, submit for processing */
- zio_delay_interrupt(zio);
--
-- /* Finish cleanup */
-- vbio_free(vbio);
--}
--
--BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
--{
-- vbio_t *vbio = bio->bi_private;
--
-- if (vbio->vbio_error == 0) {
--#ifdef HAVE_1ARG_BIO_END_IO_T
-- vbio->vbio_error = BIO_END_IO_ERROR(bio);
--#else
-- if (error)
-- vbio->vbio_error = -(error);
-- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-- vbio->vbio_error = EIO;
--#endif
-- }
--
-- /*
-- * Destroy the BIO. This is safe to do; the vbio owns its data and the
-- * kernel won't touch it again after the completion function runs.
-- */
-- bio_put(bio);
--
-- /* Drop this BIOs reference acquired by vbio_submit() */
-- vbio_put(vbio);
- }
-
- /*
-@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
- return (B_TRUE);
- }
-
--/* Iterator callback to submit ABD pages to the vbio. */
--static int
--vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
--{
-- vbio_t *vbio = priv;
-- return (vbio_add_page(vbio, page, len, off));
--}
--
- static int
- vdev_disk_io_rw(zio_t *zio)
- {
-@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
- }
-
- /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
-- int error = 0;
-- vbio_t *vbio = vbio_alloc(zio, bdev);
-+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
- if (abd != zio->io_abd)
- vbio->vbio_abd = abd;
-
-- /* Fill it with pages */
-- error = abd_iterate_page_func(abd, 0, zio->io_size,
-- vdev_disk_fill_vbio_cb, vbio);
-- if (error != 0) {
-- vbio_free(vbio);
-- return (error);
-- }
--
-- vbio_submit(vbio, flags);
-+ /* Fill it with data pages and submit it to the kernel */
-+ vbio_submit(vbio, abd, zio->io_size);
- return (0);
- }
-
diff --git a/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch b/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
deleted file mode 100644
index 28dbbf9d..00000000
--- a/debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
+++ /dev/null
@@ -1,96 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Thu, 14 Mar 2024 10:57:30 +1100
-Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5
-
-Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
-in a compound page were refcounted separately. This means that using the
-head page without taking a reference to it could see it cleaned up later
-before we're finished with it. Specifically, bio_add_page() would take a
-reference, and drop its reference after the bio completion callback
-returns.
-
-If the zio is executed immediately from the completion callback, this is
-usually ok, as any data is referenced through the tail page referenced
-by the ABD, and so becomes "live" that way. If there's a delay in zio
-execution (high load, error injection), then the head page can be freed,
-along with any dirty flags or other indicators that the underlying
-memory is used. Later, when the zio completes and that memory is
-accessed, it's either unmapped and an unhandled fault takes down the
-entire system, or it is mapped and we end up messing around in someone
-else's memory. Both of these are very bad.
-
-The solution on these older kernels is to take a reference to the head
-page when we use it, and release it when we're done. There's not really
-a sensible way under our current structure to do this; the "best" would
-be to keep a list of head page references in the ABD, and release them
-when the ABD is freed.
-
-Since this additional overhead is totally unnecessary on 4.5+, where
-head and tail pages share refcounts, I've opted to simply not use the
-compound head in ABD page iteration on older kernels. This is theoretically
-less efficient (though cleaning up head page references would add overhead),
-but it's safe, and we still get the other benefits of not mapping pages
-before adding them to a bio and not mis-splitting pages.
-
-There doesn't appear to be an obvious symbol name or config option we
-can match on to discover this behaviour in configure (and the mm/page
-APIs have changed a lot since then anyway), so I've gone with a simple
-version check.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-Closes #15533
-Closes #15588
-(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
----
- module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
- 1 file changed, 14 insertions(+)
-
-diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
-index 3fe01c0b7..d3255dcbc 100644
---- a/module/os/linux/zfs/abd_os.c
-+++ b/module/os/linux/zfs/abd_os.c
-@@ -62,6 +62,7 @@
- #include <linux/kmap_compat.h>
- #include <linux/mm_compat.h>
- #include <linux/scatterlist.h>
-+#include <linux/version.h>
- #endif
-
- #ifdef _KERNEL
-@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
- }
- ASSERT(page);
-
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
- if (PageTail(page)) {
- /*
- * This page is part of a "compound page", which is a group of
-@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
- * To do this, we need to adjust the offset to be counted from
- * the head page. struct page for compound pages are stored
- * contiguously, so we can just adjust by a simple offset.
-+ *
-+ * Before kernel 4.5, compound page heads were refcounted
-+ * separately, such that moving back to the head page would
-+ * require us to take a reference to it and release it once
-+ * we're completely finished with it. In practice, that means
-+ * when our caller is done with the ABD, which we have no
-+ * insight into from here. Rather than contort this API to
-+ * track head page references on such ancient kernels, we just
-+ * compile this block out and use the tail pages directly. This
-+ * is slightly less efficient, but makes everything far
-+ * simpler.
- */
- struct page *head = compound_head(page);
- doff += ((page - head) * PAGESIZE);
- page = head;
- }
-+#endif
-
- /* final page and position within it */
- aiter->iter_page = page;
diff --git a/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch b/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
deleted file mode 100644
index e2f1422f..00000000
--- a/debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
+++ /dev/null
@@ -1,90 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Wed, 27 Mar 2024 13:11:12 +1100
-Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x
-
-We don't want to change to brand-new code in the middle of a stable
-series, but we want it available to test for people running into page
-splitting issues.
-
-This commit makes zfs_vdev_disk_classic=1 the default, and updates the
-documentation to better explain what's going on.
-
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
----
- man/man4/zfs.4 | 31 ++++++++++++++++++++++---------
- module/os/linux/zfs/vdev_disk.c | 8 +++++---
- 2 files changed, 27 insertions(+), 12 deletions(-)
-
-diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
-index 6a628e7f3..a98ec519a 100644
---- a/man/man4/zfs.4
-+++ b/man/man4/zfs.4
-@@ -1355,17 +1355,30 @@ This parameter only applies on Linux.
- This parameter is ignored if
- .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
- .
--.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
--If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
--and earlier.
--This "classic" method has known issues with highly fragmented IO requests and
--is slower on many workloads, but it has been in use for many years and is known
--to be very stable.
--If you set this parameter, please also open a bug report why you did so,
-+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
-+Controls the method used to submit IO to the Linux block layer
-+(default
-+.Sy 1 "classic" Ns
-+)
-+.Pp
-+If set to 1, the "classic" method is used.
-+This is the method that has been in use since the earliest versions of
-+ZFS-on-Linux.
-+It has known issues with highly fragmented IO requests and is less efficient on
-+many workloads, but it is well known and well understood.
-+.Pp
-+If set to 0, the "new" method is used.
-+This method is available since 2.2.4 and should resolve all known issues and be
-+far more efficient, but has not had as much testing.
-+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
-+.Pp
-+It is not recommended that you change it except on advice from the OpenZFS
-+developers.
-+If you do change it, please also open a bug report describing why you did so,
- including the workload involved and any error messages.
- .Pp
--This parameter and the classic submission method will be removed once we have
--total confidence in the new method.
-+This parameter and the "classic" submission method will be removed in a future
-+release of OpenZFS once we have total confidence in the new method.
- .Pp
- This parameter only applies on Linux, and can only be set at module load time.
- .
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index 36468fc21..e1c19a085 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
- /*
- * This is the classic, battle-tested BIO submission code. Until we're totally
- * sure that the new code is safe and correct in all cases, this will remain
-- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
-- * load time.
-+ * available.
-+ *
-+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
-+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
- *
- * These functions have been renamed to vdev_classic_* to make it clear what
- * they belong to, but their implementations are unchanged.
-@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
- * BIO submission method. See comment above about vdev_classic.
- * Set zfs_vdev_disk_classic=0 for new, =1 for classic
- */
--static uint_t zfs_vdev_disk_classic = 0; /* default new */
-+static uint_t zfs_vdev_disk_classic = 1; /* default classic */
-
- /* Set submission function from module parameter */
- static int
diff --git a/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch b/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
deleted file mode 100644
index 027f299d..00000000
--- a/debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
+++ /dev/null
@@ -1,104 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Robert Evans <rrevans at gmail.com>
-Date: Mon, 25 Mar 2024 17:56:49 -0400
-Subject: [PATCH] Fix corruption caused by mmap flushing problems
-
-1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
- already in writeback unless data-integrity sync is requested.
-
-2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
- skipped due to DMU pushing back on TX assign.
-
-3) Add missing mmap flush when doing block cloning.
-
-4) While here, pass errors from putpage to writepage/writepages.
-
-This change fixes corruption edge cases, but unfortunately adds
-synchronous ZIL flushes for dirty mmap pages to llseek and bclone
-operations. It may be possible to avoid these sync writes later
-but would need more tricky refactoring of the writeback code.
-
-Reviewed-by: Alexander Motin <mav at FreeBSD.org>
-Reviewed-by: Brian Behlendorf <behlendorf1 at llnl.gov>
-Signed-off-by: Robert Evans <evansr at google.com>
-Closes #15933
-Closes #16019
----
- module/os/linux/zfs/zfs_vnops_os.c | 5 +----
- module/os/linux/zfs/zpl_file.c | 8 ++++----
- module/zfs/zfs_vnops.c | 6 +++++-
- 3 files changed, 10 insertions(+), 9 deletions(-)
-
-diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
-index c06a75662..7c473bc7e 100644
---- a/module/os/linux/zfs/zfs_vnops_os.c
-+++ b/module/os/linux/zfs/zfs_vnops_os.c
-@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
-
-- err = dmu_tx_assign(tx, TXG_NOWAIT);
-+ err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
-- if (err == ERESTART)
-- dmu_tx_wait(tx);
--
- dmu_tx_abort(tx);
- #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
- filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
-index 3caa0fc6c..9dec52215 100644
---- a/module/os/linux/zfs/zpl_file.c
-+++ b/module/os/linux/zfs/zpl_file.c
-@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
- {
- boolean_t *for_sync = data;
- fstrans_cookie_t cookie;
-+ int ret;
-
- ASSERT(PageLocked(pp));
- ASSERT(!PageWriteback(pp));
-
- cookie = spl_fstrans_mark();
-- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
-+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
- spl_fstrans_unmark(cookie);
-
-- return (0);
-+ return (ret);
- }
-
- #ifdef HAVE_WRITEPAGE_T_FOLIO
- static int
- zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
- {
-- (void) zpl_putpage(&pp->page, wbc, data);
-- return (0);
-+ return (zpl_putpage(&pp->page, wbc, data));
- }
- #endif
-
-diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
-index 2b37834d5..7020f88ec 100644
---- a/module/zfs/zfs_vnops.c
-+++ b/module/zfs/zfs_vnops.c
-@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
-
- /* Flush any mmap()'d data to disk */
- if (zn_has_cached_data(zp, 0, file_sz - 1))
-- zn_flush_cached_data(zp, B_FALSE);
-+ zn_flush_cached_data(zp, B_TRUE);
-
- lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
- error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
-@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
- }
- }
-
-+ /* Flush any mmap()'d data to disk */
-+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
-+ zn_flush_cached_data(inzp, B_TRUE);
-+
- /*
- * Maintain predictable lock order.
- */
diff --git a/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch b/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
deleted file mode 100644
index 83eac378..00000000
--- a/debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
+++ /dev/null
@@ -1,57 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Rob Norris <rob.norris at klarasystems.com>
-Date: Tue, 2 Apr 2024 15:14:54 +1100
-Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the
- kernel
-
-After IO is unplugged, it may complete immediately and vbio_completion
-be called on interrupt context. That may interrupt or deschedule our
-task. If it's the last bio, the vbio will be freed. Then, we get
-rescheduled, and try to write to freed memory through vbio->.
-
-This patch just removes the cleanup, and the corresponding assert.
-These were leftovers from a previous iteration of vbio_submit() and were
-always "belt and suspenders" ops anyway, never strictly required.
-
-Reported-by: Rich Ercolani <rincebrain at gmail.com>
-Signed-off-by: Rob Norris <rob.norris at klarasystems.com>
-Sponsored-by: Klara, Inc.
-Sponsored-by: Wasabi Technology, Inc.
-(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
-Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
----
- module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
- 1 file changed, 6 insertions(+), 5 deletions(-)
-
-diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
-index e1c19a085..62c7aa14f 100644
---- a/module/os/linux/zfs/vdev_disk.c
-+++ b/module/os/linux/zfs/vdev_disk.c
-@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
- static void
- vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
- {
-- ASSERT(vbio->vbio_bdev);
--
- /*
- * We plug so we can submit the BIOs as we go and only unplug them when
- * they are fully created and submitted. This is important; if we don't
-@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
- vbio->vbio_bio->bi_end_io = vbio_completion;
- vbio->vbio_bio->bi_private = vbio;
-
-+ /*
-+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we
-+ * can't touch it again. The bio may complete and vbio_completion() be
-+ * called and free the vbio before this task is run again, so we must
-+ * consider it invalid from this point.
-+ */
- vdev_submit_bio(vbio->vbio_bio);
-
- blk_finish_plug(&plug);
--
-- vbio->vbio_bio = NULL;
-- vbio->vbio_bdev = NULL;
- }
-
- /* IO completion callback */
diff --git a/debian/patches/series b/debian/patches/series
index 7c1a5c6c..35f81d13 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -9,17 +9,3 @@
0009-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
-0012-udev-correctly-handle-partition-16-and-later.patch
-0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
-0014-linux-5.4-compat-page_size.patch
-0015-abd-add-page-iterator.patch
-0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
-0017-vdev_disk-reorganise-vdev_disk_io_start.patch
-0018-vdev_disk-make-read-write-IO-function-configurable.patch
-0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
-0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
-0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
-0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
-0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
-0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
-0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
diff --git a/upstream b/upstream
index c883088d..25665920 160000
--- a/upstream
+++ b/upstream
@@ -1 +1 @@
-Subproject commit c883088df83ced3a2b8b38e6d89a5e63fb153ee4
+Subproject commit 2566592045780e7be7afc899c2496b1ae3af4f4d
--
2.39.2
More information about the pve-devel
mailing list