[pve-devel] [PATCH qemu 4/4] cherry-pick upstream fixes for 7.0.0
Fabian Ebner
f.ebner at proxmox.com
Mon Jun 27 13:05:43 CEST 2022
coming in via qemu-stable (except for the vmdk fix, which was tagged
for-7.0 on the qemu-devel list, but didn't make it into the release).
Also took the chance to switch the gluster fix to the version that
made it into upstream.
Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
---
...rrectly-set-max_pdiscard-which-is-in.patch | 38 ------
...k-gluster-correctly-set-max_pdiscard.patch | 47 +++++++
...003-block-vmdk-Fix-reopening-bs-file.patch | 129 ++++++++++++++++++
...balanced-plugged-counter-in-laio_io_.patch | 44 ++++++
...erflow-in-snprintf-string-formatting.patch | 100 ++++++++++++++
...Fix-disabling-MPX-on-cpu-host-with-M.patch | 48 +++++++
...ontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch | 121 ++++++++++++++++
...outine-use-QEMU_DEFINE_STATIC_CO_TLS.patch | 123 +++++++++++++++++
...ame-qemu_coroutine_inc-dec_pool_size.patch | 90 ++++++++++++
...outine-Revert-to-constant-batch-size.patch | 121 ++++++++++++++++
...-not-consult-nonexistent-host-leaves.patch | 117 ++++++++++++++++
...ctrl-and-event-handler-functions-in-.patch | 108 +++++++++++++++
...t-waste-CPU-polling-the-event-virtqu.patch | 91 ++++++++++++
...k-descriptor-chain-in-private-at-SVQ.patch | 102 ++++++++++++++
...Fix-device-s-used-descriptor-dequeue.patch | 62 +++++++++
...ex-calculus-at-vhost_vdpa_get_vring_.patch | 39 ++++++
...ex-calculus-at-vhost_vdpa_svqs_start.patch | 35 +++++
...virtio-Replace-g_memdup-by-g_memdup2.patch | 74 ++++++++++
...Fix-element-in-vhost_svq_add-failure.patch | 47 +++++++
debian/patches/series | 19 ++-
20 files changed, 1516 insertions(+), 39 deletions(-)
delete mode 100644 debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard-which-is-in.patch
create mode 100644 debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard.patch
create mode 100644 debian/patches/extra/0003-block-vmdk-Fix-reopening-bs-file.patch
create mode 100644 debian/patches/extra/0004-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch
create mode 100644 debian/patches/extra/0005-pci-fix-overflow-in-snprintf-string-formatting.patch
create mode 100644 debian/patches/extra/0006-target-i386-kvm-Fix-disabling-MPX-on-cpu-host-with-M.patch
create mode 100644 debian/patches/extra/0007-coroutine-ucontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch
create mode 100644 debian/patches/extra/0008-coroutine-use-QEMU_DEFINE_STATIC_CO_TLS.patch
create mode 100644 debian/patches/extra/0009-coroutine-Rename-qemu_coroutine_inc-dec_pool_size.patch
create mode 100644 debian/patches/extra/0010-coroutine-Revert-to-constant-batch-size.patch
create mode 100644 debian/patches/extra/0011-target-i386-do-not-consult-nonexistent-host-leaves.patch
create mode 100644 debian/patches/extra/0012-virtio-scsi-fix-ctrl-and-event-handler-functions-in-.patch
create mode 100644 debian/patches/extra/0013-virtio-scsi-don-t-waste-CPU-polling-the-event-virtqu.patch
create mode 100644 debian/patches/extra/0014-vhost-Track-descriptor-chain-in-private-at-SVQ.patch
create mode 100644 debian/patches/extra/0015-vhost-Fix-device-s-used-descriptor-dequeue.patch
create mode 100644 debian/patches/extra/0016-vdpa-Fix-bad-index-calculus-at-vhost_vdpa_get_vring_.patch
create mode 100644 debian/patches/extra/0017-vdpa-Fix-index-calculus-at-vhost_vdpa_svqs_start.patch
create mode 100644 debian/patches/extra/0018-hw-virtio-Replace-g_memdup-by-g_memdup2.patch
create mode 100644 debian/patches/extra/0019-vhost-Fix-element-in-vhost_svq_add-failure.patch
diff --git a/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard-which-is-in.patch b/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard-which-is-in.patch
deleted file mode 100644
index 91676e4..0000000
--- a/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard-which-is-in.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner at proxmox.com>
-Date: Fri, 6 May 2022 14:38:35 +0200
-Subject: [PATCH] block/gluster: correctly set max_pdiscard which is int64_t
-
-Previously, max_pdiscard would be zero in the following assertion:
-qemu-system-x86_64: ../block/io.c:3166: bdrv_co_pdiscard: Assertion
-`max_pdiscard >= bs->bl.request_alignment' failed.
-
-Fixes: 0c8022876f ("block: use int64_t instead of int in driver discard handlers")
-Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
----
- block/gluster.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/block/gluster.c b/block/gluster.c
-index 398976bc66..592e71b22a 100644
---- a/block/gluster.c
-+++ b/block/gluster.c
-@@ -891,7 +891,7 @@ out:
- static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp)
- {
- bs->bl.max_transfer = GLUSTER_MAX_TRANSFER;
-- bs->bl.max_pdiscard = SIZE_MAX;
-+ bs->bl.max_pdiscard = INT64_MAX;
- }
-
- static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
-@@ -1304,7 +1304,7 @@ static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
- GlusterAIOCB acb;
- BDRVGlusterState *s = bs->opaque;
-
-- assert(bytes <= SIZE_MAX); /* rely on max_pdiscard */
-+ assert(bytes <= INT64_MAX); /* rely on max_pdiscard */
-
- acb.size = 0;
- acb.ret = 0;
diff --git a/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard.patch b/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard.patch
new file mode 100644
index 0000000..c0b8ee0
--- /dev/null
+++ b/debian/patches/extra/0002-block-gluster-correctly-set-max_pdiscard.patch
@@ -0,0 +1,47 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner at proxmox.com>
+Date: Fri, 20 May 2022 09:59:22 +0200
+Subject: [PATCH] block/gluster: correctly set max_pdiscard
+
+On 64-bit platforms, assigning SIZE_MAX to the int64_t max_pdiscard
+results in a negative value, and the following assertion would trigger
+down the line (it's not the same max_pdiscard, but computed from the
+other one):
+qemu-system-x86_64: ../block/io.c:3166: bdrv_co_pdiscard: Assertion
+`max_pdiscard >= bs->bl.request_alignment' failed.
+
+On 32-bit platforms, it's fine to keep using SIZE_MAX.
+
+The assertion in qemu_gluster_co_pdiscard() is checking that the value
+of 'bytes' can safely be passed to glfs_discard_async(), which takes a
+size_t for the argument in question, so it is kept as is. And since
+max_pdiscard is still <= SIZE_MAX, relying on max_pdiscard is still
+fine.
+
+Fixes: 0c8022876f ("block: use int64_t instead of int in driver discard handlers")
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+Message-Id: <20220520075922.43972-1-f.ebner at proxmox.com>
+Reviewed-by: Eric Blake <eblake at redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare at redhat.com>
+Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov at yandex-team.ru>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit 9b38fc56c054c7de65fa3bf7cdd82b32654f6b7d)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ block/gluster.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/block/gluster.c b/block/gluster.c
+index 80b75cb96c..1079b6186b 100644
+--- a/block/gluster.c
++++ b/block/gluster.c
+@@ -901,7 +901,7 @@ out:
+ static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp)
+ {
+ bs->bl.max_transfer = GLUSTER_MAX_TRANSFER;
+- bs->bl.max_pdiscard = SIZE_MAX;
++ bs->bl.max_pdiscard = MIN(SIZE_MAX, INT64_MAX);
+ }
+
+ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
diff --git a/debian/patches/extra/0003-block-vmdk-Fix-reopening-bs-file.patch b/debian/patches/extra/0003-block-vmdk-Fix-reopening-bs-file.patch
new file mode 100644
index 0000000..65c43de
--- /dev/null
+++ b/debian/patches/extra/0003-block-vmdk-Fix-reopening-bs-file.patch
@@ -0,0 +1,129 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Hanna Reitz <hreitz at redhat.com>
+Date: Mon, 14 Mar 2022 17:27:18 +0100
+Subject: [PATCH] block/vmdk: Fix reopening bs->file
+
+VMDK disk data is stored in extents, which may or may not be separate
+from bs->file. VmdkExtent.file points to where they are stored. Each
+extent that is stored in bs->file will simply reuse the exact pointer value of
+bs->file.
+
+(That is why vmdk_free_extents() will unref VmdkExtent.file (e->file)
+only if e->file != bs->file.)
+
+Reopen operations can change bs->file (they will replace the whole
+BdrvChild object, not just the BDS stored in that BdrvChild), and then
+we will need to change all .file pointers of all such VmdkExtents to
+point to the new BdrvChild.
+
+In vmdk_reopen_prepare(), we have to check which VmdkExtents are
+affected, and in vmdk_reopen_commit(), we can modify them. We have to
+split this because:
+- The new BdrvChild is created only after prepare, so we can change
+ VmdkExtent.file only in commit
+- In commit, there no longer is any (valid) reference to the old
+ BdrvChild object, so there would be nothing to compare VmdkExtent.file
+ against to see whether it was equal to bs->file before reopening
+ (There is BDRVReopenState.old_file_bs, but the old bs->file
+ BdrvChild's .bs pointer will be NULL-ed when the new BdrvChild is
+ created, and so we cannot compare VmdkExtent.file->bs against
+ BDRVReopenState.old_file_bs)
+
+Signed-off-by: Hanna Reitz <hreitz at redhat.com>
+Message-Id: <20220314162719.65384-2-hreitz at redhat.com>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit 6d17e2879854d7d0e623c06a9286085e97bf3545)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ block/vmdk.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 55 insertions(+), 1 deletion(-)
+
+diff --git a/block/vmdk.c b/block/vmdk.c
+index 37c0946066..38e5ab3806 100644
+--- a/block/vmdk.c
++++ b/block/vmdk.c
+@@ -178,6 +178,10 @@ typedef struct BDRVVmdkState {
+ char *create_type;
+ } BDRVVmdkState;
+
++typedef struct BDRVVmdkReopenState {
++ bool *extents_using_bs_file;
++} BDRVVmdkReopenState;
++
+ typedef struct VmdkMetaData {
+ unsigned int l1_index;
+ unsigned int l2_index;
+@@ -400,15 +404,63 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
+ return 1;
+ }
+
+-/* We have nothing to do for VMDK reopen, stubs just return success */
+ static int vmdk_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+ {
++ BDRVVmdkState *s;
++ BDRVVmdkReopenState *rs;
++ int i;
++
+ assert(state != NULL);
+ assert(state->bs != NULL);
++ assert(state->opaque == NULL);
++
++ s = state->bs->opaque;
++
++ rs = g_new0(BDRVVmdkReopenState, 1);
++ state->opaque = rs;
++
++ /*
++ * Check whether there are any extents stored in bs->file; if bs->file
++ * changes, we will need to update their .file pointers to follow suit
++ */
++ rs->extents_using_bs_file = g_new(bool, s->num_extents);
++ for (i = 0; i < s->num_extents; i++) {
++ rs->extents_using_bs_file[i] = s->extents[i].file == state->bs->file;
++ }
++
+ return 0;
+ }
+
++static void vmdk_reopen_clean(BDRVReopenState *state)
++{
++ BDRVVmdkReopenState *rs = state->opaque;
++
++ g_free(rs->extents_using_bs_file);
++ g_free(rs);
++ state->opaque = NULL;
++}
++
++static void vmdk_reopen_commit(BDRVReopenState *state)
++{
++ BDRVVmdkState *s = state->bs->opaque;
++ BDRVVmdkReopenState *rs = state->opaque;
++ int i;
++
++ for (i = 0; i < s->num_extents; i++) {
++ if (rs->extents_using_bs_file[i]) {
++ s->extents[i].file = state->bs->file;
++ }
++ }
++
++ vmdk_reopen_clean(state);
++}
++
++static void vmdk_reopen_abort(BDRVReopenState *state)
++{
++ vmdk_reopen_clean(state);
++}
++
+ static int vmdk_parent_open(BlockDriverState *bs)
+ {
+ char *p_name;
+@@ -3072,6 +3124,8 @@ static BlockDriver bdrv_vmdk = {
+ .bdrv_open = vmdk_open,
+ .bdrv_co_check = vmdk_co_check,
+ .bdrv_reopen_prepare = vmdk_reopen_prepare,
++ .bdrv_reopen_commit = vmdk_reopen_commit,
++ .bdrv_reopen_abort = vmdk_reopen_abort,
+ .bdrv_child_perm = bdrv_default_perms,
+ .bdrv_co_preadv = vmdk_co_preadv,
+ .bdrv_co_pwritev = vmdk_co_pwritev,
diff --git a/debian/patches/extra/0004-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch b/debian/patches/extra/0004-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch
new file mode 100644
index 0000000..cf130ac
--- /dev/null
+++ b/debian/patches/extra/0004-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch
@@ -0,0 +1,44 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Hajnoczi <stefanha at redhat.com>
+Date: Thu, 9 Jun 2022 17:47:11 +0100
+Subject: [PATCH] linux-aio: fix unbalanced plugged counter in laio_io_unplug()
+
+Every laio_io_plug() call has a matching laio_io_unplug() call. There is
+a plugged counter that tracks the number of levels of plugging and
+allows for nesting.
+
+The plugged counter must reflect the balance between laio_io_plug() and
+laio_io_unplug() calls accurately. Otherwise I/O stalls occur since
+io_submit(2) calls are skipped while plugged.
+
+Reported-by: Nikolay Tenev <nt at storpool.com>
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+Reviewed-by: Stefano Garzarella <sgarzare at redhat.com>
+Message-id: 20220609164712.1539045-2-stefanha at redhat.com
+Cc: Stefano Garzarella <sgarzare at redhat.com>
+Fixes: 68d7946648 ("linux-aio: add `dev_max_batch` parameter to laio_io_unplug()")
+[Stefano Garzarella suggested adding a Fixes tag.
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+(cherry-picked from commit f387cac5af030a58ac5a0dacf64cab5e5a4fe5c7)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ block/linux-aio.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/block/linux-aio.c b/block/linux-aio.c
+index 4c423fcccf..6078da7e42 100644
+--- a/block/linux-aio.c
++++ b/block/linux-aio.c
+@@ -363,8 +363,10 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
+ uint64_t dev_max_batch)
+ {
+ assert(s->io_q.plugged);
++ s->io_q.plugged--;
++
+ if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
+- (--s->io_q.plugged == 0 &&
++ (!s->io_q.plugged &&
+ !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+ ioq_submit(s);
+ }
diff --git a/debian/patches/extra/0005-pci-fix-overflow-in-snprintf-string-formatting.patch b/debian/patches/extra/0005-pci-fix-overflow-in-snprintf-string-formatting.patch
new file mode 100644
index 0000000..3f2dfa9
--- /dev/null
+++ b/debian/patches/extra/0005-pci-fix-overflow-in-snprintf-string-formatting.patch
@@ -0,0 +1,100 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Claudio Fontana <cfontana at suse.de>
+Date: Tue, 31 May 2022 13:47:07 +0200
+Subject: [PATCH] pci: fix overflow in snprintf string formatting
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+the code in pcibus_get_fw_dev_path contained the potential for a
+stack buffer overflow of 1 byte, potentially writing to the stack an
+extra NUL byte.
+
+This overflow could happen if the PCI slot is >= 0x10000000,
+and the PCI function is >= 0x10000000, due to the size parameter
+of snprintf being incorrectly calculated in the call:
+
+ if (PCI_FUNC(d->devfn))
+ snprintf(path + off, sizeof(path) + off, ",%x", PCI_FUNC(d->devfn));
+
+since the off obtained from a previous call to snprintf is added
+instead of subtracted from the total available size of the buffer.
+
+Without the accurate size guard from snprintf, we end up writing in the
+worst case:
+
+name (32) + "@" (1) + SLOT (8) + "," (1) + FUNC (8) + term NUL (1) = 51 bytes
+
+In order to provide something more robust, replace all of the code in
+pcibus_get_fw_dev_path with a single call to g_strdup_printf,
+so there is no need to rely on manual calculations.
+
+Found by compiling QEMU with FORTIFY_SOURCE=3 as the error:
+
+*** buffer overflow detected ***: terminated
+
+Thread 1 "qemu-system-x86" received signal SIGABRT, Aborted.
+[Switching to Thread 0x7ffff642c380 (LWP 121307)]
+0x00007ffff71ff55c in __pthread_kill_implementation () from /lib64/libc.so.6
+(gdb) bt
+ #0 0x00007ffff71ff55c in __pthread_kill_implementation () at /lib64/libc.so.6
+ #1 0x00007ffff71ac6f6 in raise () at /lib64/libc.so.6
+ #2 0x00007ffff7195814 in abort () at /lib64/libc.so.6
+ #3 0x00007ffff71f279e in __libc_message () at /lib64/libc.so.6
+ #4 0x00007ffff729767a in __fortify_fail () at /lib64/libc.so.6
+ #5 0x00007ffff7295c36 in () at /lib64/libc.so.6
+ #6 0x00007ffff72957f5 in __snprintf_chk () at /lib64/libc.so.6
+ #7 0x0000555555b1c1fd in pcibus_get_fw_dev_path ()
+ #8 0x0000555555f2bde4 in qdev_get_fw_dev_path_helper.constprop ()
+ #9 0x0000555555f2bd86 in qdev_get_fw_dev_path_helper.constprop ()
+ #10 0x00005555559a6e5d in get_boot_device_path ()
+ #11 0x00005555559a712c in get_boot_devices_list ()
+ #12 0x0000555555b1a3d0 in fw_cfg_machine_reset ()
+ #13 0x0000555555bf4c2d in pc_machine_reset ()
+ #14 0x0000555555c66988 in qemu_system_reset ()
+ #15 0x0000555555a6dff6 in qdev_machine_creation_done ()
+ #16 0x0000555555c79186 in qmp_x_exit_preconfig.part ()
+ #17 0x0000555555c7b459 in qemu_init ()
+ #18 0x0000555555960a29 in main ()
+
+Found-by: Dario Faggioli <dfaggioli at suse.com>
+Found-by: Martin Liška <martin.liska at suse.com>
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Claudio Fontana <cfontana at suse.de>
+Message-Id: <20220531114707.18830-1-cfontana at suse.de>
+Reviewed-by: Ani Sinha <ani at anisinha.ca>
+(cherry-picked from commit 36f18c6989a3d1ff1d7a0e50b0868ef3958299b4)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/pci/pci.c | 18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+index dae9119bfe..c69b412434 100644
+--- a/hw/pci/pci.c
++++ b/hw/pci/pci.c
+@@ -2625,15 +2625,15 @@ static char *pci_dev_fw_name(DeviceState *dev, char *buf, int len)
+ static char *pcibus_get_fw_dev_path(DeviceState *dev)
+ {
+ PCIDevice *d = (PCIDevice *)dev;
+- char path[50], name[33];
+- int off;
+-
+- off = snprintf(path, sizeof(path), "%s@%x",
+- pci_dev_fw_name(dev, name, sizeof name),
+- PCI_SLOT(d->devfn));
+- if (PCI_FUNC(d->devfn))
+- snprintf(path + off, sizeof(path) + off, ",%x", PCI_FUNC(d->devfn));
+- return g_strdup(path);
++ char name[33];
++ int has_func = !!PCI_FUNC(d->devfn);
++
++ return g_strdup_printf("%s@%x%s%.*x",
++ pci_dev_fw_name(dev, name, sizeof(name)),
++ PCI_SLOT(d->devfn),
++ has_func ? "," : "",
++ has_func,
++ PCI_FUNC(d->devfn));
+ }
+
+ static char *pcibus_get_dev_path(DeviceState *dev)
diff --git a/debian/patches/extra/0006-target-i386-kvm-Fix-disabling-MPX-on-cpu-host-with-M.patch b/debian/patches/extra/0006-target-i386-kvm-Fix-disabling-MPX-on-cpu-host-with-M.patch
new file mode 100644
index 0000000..d393ed1
--- /dev/null
+++ b/debian/patches/extra/0006-target-i386-kvm-Fix-disabling-MPX-on-cpu-host-with-M.patch
@@ -0,0 +1,48 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Maciej S. Szmigiero" <maciej.szmigiero at oracle.com>
+Date: Mon, 23 May 2022 18:26:58 +0200
+Subject: [PATCH] target/i386/kvm: Fix disabling MPX on "-cpu host" with
+ MPX-capable host
+
+Since KVM commit 5f76f6f5ff96 ("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled")
+it is not possible to disable MPX on a "-cpu host" just by adding "-mpx"
+there if the host CPU does indeed support MPX.
+QEMU will fail to set MSR_IA32_VMX_TRUE_{EXIT,ENTRY}_CTLS MSRs in this case
+and so trigger an assertion failure.
+
+Instead, besides "-mpx" one has to explicitly add also
+"-vmx-exit-clear-bndcfgs" and "-vmx-entry-load-bndcfgs" to QEMU command
+line to make it work, which is a bit convoluted.
+
+Make the MPX-related bits in FEAT_VMX_{EXIT,ENTRY}_CTLS dependent on MPX
+being actually enabled so such workarounds are no longer necessary.
+
+Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero at oracle.com>
+Message-Id: <51aa2125c76363204cc23c27165e778097c33f0b.1653323077.git.maciej.szmigiero at oracle.com>
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 267b5e7e378afd260004cb37a66a6fcd641e3b53)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ target/i386/cpu.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/target/i386/cpu.c b/target/i386/cpu.c
+index cb6b5467d0..6e6945139b 100644
+--- a/target/i386/cpu.c
++++ b/target/i386/cpu.c
+@@ -1327,6 +1327,14 @@ static FeatureDep feature_dependencies[] = {
+ .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_INVPCID },
+ .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_INVPCID },
+ },
++ {
++ .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_MPX },
++ .to = { FEAT_VMX_EXIT_CTLS, VMX_VM_EXIT_CLEAR_BNDCFGS },
++ },
++ {
++ .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_MPX },
++ .to = { FEAT_VMX_ENTRY_CTLS, VMX_VM_ENTRY_LOAD_BNDCFGS },
++ },
+ {
+ .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_RDSEED },
+ .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_RDSEED_EXITING },
diff --git a/debian/patches/extra/0007-coroutine-ucontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch b/debian/patches/extra/0007-coroutine-ucontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch
new file mode 100644
index 0000000..6c4d523
--- /dev/null
+++ b/debian/patches/extra/0007-coroutine-ucontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch
@@ -0,0 +1,121 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Hajnoczi <stefanha at redhat.com>
+Date: Mon, 7 Mar 2022 15:38:51 +0000
+Subject: [PATCH] coroutine-ucontext: use QEMU_DEFINE_STATIC_CO_TLS()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Thread-Local Storage variables cannot be used directly from coroutine
+code because the compiler may optimize TLS variable accesses across
+qemu_coroutine_yield() calls. When the coroutine is re-entered from
+another thread the TLS variables from the old thread must no longer be
+used.
+
+Use QEMU_DEFINE_STATIC_CO_TLS() for the current and leader variables.
+
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+Message-Id: <20220307153853.602859-2-stefanha at redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug at amsat.org>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit 34145a307d849d0b6734d0222a7aa0bb9eef7407)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ util/coroutine-ucontext.c | 38 ++++++++++++++++++++++++--------------
+ 1 file changed, 24 insertions(+), 14 deletions(-)
+
+diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
+index 904b375192..127d5a13c8 100644
+--- a/util/coroutine-ucontext.c
++++ b/util/coroutine-ucontext.c
+@@ -25,6 +25,7 @@
+ #include "qemu/osdep.h"
+ #include <ucontext.h>
+ #include "qemu/coroutine_int.h"
++#include "qemu/coroutine-tls.h"
+
+ #ifdef CONFIG_VALGRIND_H
+ #include <valgrind/valgrind.h>
+@@ -66,8 +67,8 @@ typedef struct {
+ /**
+ * Per-thread coroutine bookkeeping
+ */
+-static __thread CoroutineUContext leader;
+-static __thread Coroutine *current;
++QEMU_DEFINE_STATIC_CO_TLS(Coroutine *, current);
++QEMU_DEFINE_STATIC_CO_TLS(CoroutineUContext, leader);
+
+ /*
+ * va_args to makecontext() must be type 'int', so passing
+@@ -97,14 +98,15 @@ static inline __attribute__((always_inline))
+ void finish_switch_fiber(void *fake_stack_save)
+ {
+ #ifdef CONFIG_ASAN
++ CoroutineUContext *leaderp = get_ptr_leader();
+ const void *bottom_old;
+ size_t size_old;
+
+ __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
+
+- if (!leader.stack) {
+- leader.stack = (void *)bottom_old;
+- leader.stack_size = size_old;
++ if (!leaderp->stack) {
++ leaderp->stack = (void *)bottom_old;
++ leaderp->stack_size = size_old;
+ }
+ #endif
+ #ifdef CONFIG_TSAN
+@@ -161,8 +163,10 @@ static void coroutine_trampoline(int i0, int i1)
+
+ /* Initialize longjmp environment and switch back the caller */
+ if (!sigsetjmp(self->env, 0)) {
+- start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, leader.stack,
+- leader.stack_size);
++ CoroutineUContext *leaderp = get_ptr_leader();
++
++ start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save,
++ leaderp->stack, leaderp->stack_size);
+ start_switch_fiber_tsan(&fake_stack_save, self, true); /* true=caller */
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+@@ -297,7 +301,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ int ret;
+ void *fake_stack_save = NULL;
+
+- current = to_;
++ set_current(to_);
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+@@ -315,18 +319,24 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+
+ Coroutine *qemu_coroutine_self(void)
+ {
+- if (!current) {
+- current = &leader.base;
++ Coroutine *self = get_current();
++ CoroutineUContext *leaderp = get_ptr_leader();
++
++ if (!self) {
++ self = &leaderp->base;
++ set_current(self);
+ }
+ #ifdef CONFIG_TSAN
+- if (!leader.tsan_co_fiber) {
+- leader.tsan_co_fiber = __tsan_get_current_fiber();
++ if (!leaderp->tsan_co_fiber) {
++ leaderp->tsan_co_fiber = __tsan_get_current_fiber();
+ }
+ #endif
+- return current;
++ return self;
+ }
+
+ bool qemu_in_coroutine(void)
+ {
+- return current && current->caller;
++ Coroutine *self = get_current();
++
++ return self && self->caller;
+ }
diff --git a/debian/patches/extra/0008-coroutine-use-QEMU_DEFINE_STATIC_CO_TLS.patch b/debian/patches/extra/0008-coroutine-use-QEMU_DEFINE_STATIC_CO_TLS.patch
new file mode 100644
index 0000000..b5c4d3b
--- /dev/null
+++ b/debian/patches/extra/0008-coroutine-use-QEMU_DEFINE_STATIC_CO_TLS.patch
@@ -0,0 +1,123 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Hajnoczi <stefanha at redhat.com>
+Date: Mon, 7 Mar 2022 15:38:52 +0000
+Subject: [PATCH] coroutine: use QEMU_DEFINE_STATIC_CO_TLS()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Thread-Local Storage variables cannot be used directly from coroutine
+code because the compiler may optimize TLS variable accesses across
+qemu_coroutine_yield() calls. When the coroutine is re-entered from
+another thread the TLS variables from the old thread must no longer be
+used.
+
+Use QEMU_DEFINE_STATIC_CO_TLS() for alloc_pool, alloc_pool_size and
+coroutine_pool_cleanup_notifier. The alloc_pool QSLIST needs a typedef so the
+return value of get_ptr_alloc_pool() can be stored in a local variable.
+
+One example of why this code is necessary: a coroutine that yields
+before calling qemu_coroutine_create() to create another coroutine is
+affected by the TLS issue.
+
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+Message-Id: <20220307153853.602859-3-stefanha at redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug at amsat.org>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit ac387a08a9c9f6b36757da912f0339c25f421f90)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ util/qemu-coroutine.c | 41 ++++++++++++++++++++++++-----------------
+ 1 file changed, 24 insertions(+), 17 deletions(-)
+
+diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
+index c03b2422ff..f3e8300c8d 100644
+--- a/util/qemu-coroutine.c
++++ b/util/qemu-coroutine.c
+@@ -18,6 +18,7 @@
+ #include "qemu/atomic.h"
+ #include "qemu/coroutine.h"
+ #include "qemu/coroutine_int.h"
++#include "qemu/coroutine-tls.h"
+ #include "block/aio.h"
+
+ /** Initial batch size is 64, and is increased on demand */
+@@ -29,17 +30,20 @@ enum {
+ static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+ static unsigned int pool_batch_size = POOL_INITIAL_BATCH_SIZE;
+ static unsigned int release_pool_size;
+-static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
+-static __thread unsigned int alloc_pool_size;
+-static __thread Notifier coroutine_pool_cleanup_notifier;
++
++typedef QSLIST_HEAD(, Coroutine) CoroutineQSList;
++QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool);
++QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size);
++QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier);
+
+ static void coroutine_pool_cleanup(Notifier *n, void *value)
+ {
+ Coroutine *co;
+ Coroutine *tmp;
++ CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
+
+- QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
+- QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
++ QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) {
++ QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
+ qemu_coroutine_delete(co);
+ }
+ }
+@@ -49,27 +53,30 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+ Coroutine *co = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+- co = QSLIST_FIRST(&alloc_pool);
++ CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
++
++ co = QSLIST_FIRST(alloc_pool);
+ if (!co) {
+ if (release_pool_size > qatomic_read(&pool_batch_size)) {
+ /* Slow path; a good place to register the destructor, too. */
+- if (!coroutine_pool_cleanup_notifier.notify) {
+- coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
+- qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
++ Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier();
++ if (!notifier->notify) {
++ notifier->notify = coroutine_pool_cleanup;
++ qemu_thread_atexit_add(notifier);
+ }
+
+ /* This is not exact; there could be a little skew between
+ * release_pool_size and the actual size of release_pool. But
+ * it is just a heuristic, it does not need to be perfect.
+ */
+- alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
+- QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
+- co = QSLIST_FIRST(&alloc_pool);
++ set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0));
++ QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool);
++ co = QSLIST_FIRST(alloc_pool);
+ }
+ }
+ if (co) {
+- QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+- alloc_pool_size--;
++ QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
++ set_alloc_pool_size(get_alloc_pool_size() - 1);
+ }
+ }
+
+@@ -93,9 +100,9 @@ static void coroutine_delete(Coroutine *co)
+ qatomic_inc(&release_pool_size);
+ return;
+ }
+- if (alloc_pool_size < qatomic_read(&pool_batch_size)) {
+- QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
+- alloc_pool_size++;
++ if (get_alloc_pool_size() < qatomic_read(&pool_batch_size)) {
++ QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next);
++ set_alloc_pool_size(get_alloc_pool_size() + 1);
+ return;
+ }
+ }
diff --git a/debian/patches/extra/0009-coroutine-Rename-qemu_coroutine_inc-dec_pool_size.patch b/debian/patches/extra/0009-coroutine-Rename-qemu_coroutine_inc-dec_pool_size.patch
new file mode 100644
index 0000000..9459471
--- /dev/null
+++ b/debian/patches/extra/0009-coroutine-Rename-qemu_coroutine_inc-dec_pool_size.patch
@@ -0,0 +1,90 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Kevin Wolf <kwolf at redhat.com>
+Date: Tue, 10 May 2022 17:10:19 +0200
+Subject: [PATCH] coroutine: Rename qemu_coroutine_inc/dec_pool_size()
+
+It's true that these functions currently affect the batch size in which
+coroutines are reused (i.e. moved from the global release pool to the
+allocation pool of a specific thread), but this is a bug and will be
+fixed in a separate patch.
+
+In fact, the comment in the header file already just promises that it
+influences the pool size, so reflect this in the name of the functions.
+As a nice side effect, the shorter function name makes some line
+wrapping unnecessary.
+
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+Message-Id: <20220510151020.105528-2-kwolf at redhat.com>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit 98e3ab35054b946f7c2aba5408822532b0920b53)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/block/virtio-blk.c | 6 ++----
+ include/qemu/coroutine.h | 6 +++---
+ util/qemu-coroutine.c | 4 ++--
+ 3 files changed, 7 insertions(+), 9 deletions(-)
+
+diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
+index 540c38f829..6a1cc41877 100644
+--- a/hw/block/virtio-blk.c
++++ b/hw/block/virtio-blk.c
+@@ -1215,8 +1215,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
+ for (i = 0; i < conf->num_queues; i++) {
+ virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
+ }
+- qemu_coroutine_increase_pool_batch_size(conf->num_queues * conf->queue_size
+- / 2);
++ qemu_coroutine_inc_pool_size(conf->num_queues * conf->queue_size / 2);
+ virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+@@ -1253,8 +1252,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
+ for (i = 0; i < conf->num_queues; i++) {
+ virtio_del_queue(vdev, i);
+ }
+- qemu_coroutine_decrease_pool_batch_size(conf->num_queues * conf->queue_size
+- / 2);
++ qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
+ qemu_del_vm_change_state_handler(s->change);
+ blockdev_mark_auto_del(s->blk);
+ virtio_cleanup(vdev);
+diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
+index c828a95ee0..5b621d1295 100644
+--- a/include/qemu/coroutine.h
++++ b/include/qemu/coroutine.h
+@@ -334,12 +334,12 @@ void coroutine_fn yield_until_fd_readable(int fd);
+ /**
+ * Increase coroutine pool size
+ */
+-void qemu_coroutine_increase_pool_batch_size(unsigned int additional_pool_size);
++void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size);
+
+ /**
+- * Devcrease coroutine pool size
++ * Decrease coroutine pool size
+ */
+-void qemu_coroutine_decrease_pool_batch_size(unsigned int additional_pool_size);
++void qemu_coroutine_dec_pool_size(unsigned int additional_pool_size);
+
+ #include "qemu/lockable.h"
+
+diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
+index f3e8300c8d..ea23929a74 100644
+--- a/util/qemu-coroutine.c
++++ b/util/qemu-coroutine.c
+@@ -212,12 +212,12 @@ AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co)
+ return co->ctx;
+ }
+
+-void qemu_coroutine_increase_pool_batch_size(unsigned int additional_pool_size)
++void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
+ {
+ qatomic_add(&pool_batch_size, additional_pool_size);
+ }
+
+-void qemu_coroutine_decrease_pool_batch_size(unsigned int removing_pool_size)
++void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
+ {
+ qatomic_sub(&pool_batch_size, removing_pool_size);
+ }
diff --git a/debian/patches/extra/0010-coroutine-Revert-to-constant-batch-size.patch b/debian/patches/extra/0010-coroutine-Revert-to-constant-batch-size.patch
new file mode 100644
index 0000000..711ffe0
--- /dev/null
+++ b/debian/patches/extra/0010-coroutine-Revert-to-constant-batch-size.patch
@@ -0,0 +1,121 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Kevin Wolf <kwolf at redhat.com>
+Date: Tue, 10 May 2022 17:10:20 +0200
+Subject: [PATCH] coroutine: Revert to constant batch size
+
+Commit 4c41c69e changed the way the coroutine pool is sized because for
+virtio-blk devices with a large queue size and heavy I/O, it was just
+too small and caused coroutines to be deleted and reallocated soon
+afterwards. The change made the size dynamic based on the number of
+queues and the queue size of virtio-blk devices.
+
+There are two important numbers here: Slightly simplified, when a
+coroutine terminates, it is generally stored in the global release pool
+up to a certain pool size, and if the pool is full, it is freed.
+Conversely, when allocating a new coroutine, the coroutines in the
+release pool are reused if the pool already has reached a certain
+minimum size (the batch size), otherwise we allocate new coroutines.
+
+The problem after commit 4c41c69e is that it not only increases the
+maximum pool size (which is the intended effect), but also the batch
+size for reusing coroutines (which is a bug). It means that in cases
+with many devices and/or a large queue size (which defaults to the
+number of vcpus for virtio-blk-pci), many thousand coroutines could be
+sitting in the release pool without being reused.
+
+This is not only a waste of memory and allocations, but it actually
+makes the QEMU process likely to hit the vm.max_map_count limit on Linux
+because each coroutine requires two mappings (its stack and the guard
+page for the stack), causing it to abort() in qemu_alloc_stack() because
+when the limit is hit, mprotect() starts to fail with ENOMEM.
+
+In order to fix the problem, change the batch size back to 64 to avoid
+uselessly accumulating coroutines in the release pool, but keep the
+dynamic maximum pool size so that coroutines aren't freed too early
+in heavy I/O scenarios.
+
+Note that this fix doesn't strictly make it impossible to hit the limit,
+but this would only happen if most of the coroutines are actually in use
+at the same time, not just sitting in a pool. This is the same behaviour
+as we already had before commit 4c41c69e. Fully preventing this would
+require allowing qemu_coroutine_create() to return an error, but it
+doesn't seem to be a scenario that people hit in practice.
+
+Cc: qemu-stable at nongnu.org
+Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2079938
+Fixes: 4c41c69e05fe28c0f95f8abd2ebf407e95a4f04b
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+Message-Id: <20220510151020.105528-3-kwolf at redhat.com>
+Tested-by: Hiroki Narukawa <hnarukaw at yahoo-corp.jp>
+Signed-off-by: Kevin Wolf <kwolf at redhat.com>
+(cherry-picked from commit 9ec7a59b5aad4b736871c378d30f5ef5ec51cb52)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ util/qemu-coroutine.c | 22 ++++++++++++++--------
+ 1 file changed, 14 insertions(+), 8 deletions(-)
+
+diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
+index ea23929a74..4a8bd63ef0 100644
+--- a/util/qemu-coroutine.c
++++ b/util/qemu-coroutine.c
+@@ -21,14 +21,20 @@
+ #include "qemu/coroutine-tls.h"
+ #include "block/aio.h"
+
+-/** Initial batch size is 64, and is increased on demand */
++/**
++ * The minimal batch size is always 64, coroutines from the release_pool are
++ * reused as soon as there are 64 coroutines in it. The maximum pool size starts
++ * with 64 and is increased on demand so that coroutines are not deleted even if
++ * they are not immediately reused.
++ */
+ enum {
+- POOL_INITIAL_BATCH_SIZE = 64,
++ POOL_MIN_BATCH_SIZE = 64,
++ POOL_INITIAL_MAX_SIZE = 64,
+ };
+
+ /** Free list to speed up creation */
+ static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+-static unsigned int pool_batch_size = POOL_INITIAL_BATCH_SIZE;
++static unsigned int pool_max_size = POOL_INITIAL_MAX_SIZE;
+ static unsigned int release_pool_size;
+
+ typedef QSLIST_HEAD(, Coroutine) CoroutineQSList;
+@@ -57,7 +63,7 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+
+ co = QSLIST_FIRST(alloc_pool);
+ if (!co) {
+- if (release_pool_size > qatomic_read(&pool_batch_size)) {
++ if (release_pool_size > POOL_MIN_BATCH_SIZE) {
+ /* Slow path; a good place to register the destructor, too. */
+ Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier();
+ if (!notifier->notify) {
+@@ -95,12 +101,12 @@ static void coroutine_delete(Coroutine *co)
+ co->caller = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+- if (release_pool_size < qatomic_read(&pool_batch_size) * 2) {
++ if (release_pool_size < qatomic_read(&pool_max_size) * 2) {
+ QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
+ qatomic_inc(&release_pool_size);
+ return;
+ }
+- if (get_alloc_pool_size() < qatomic_read(&pool_batch_size)) {
++ if (get_alloc_pool_size() < qatomic_read(&pool_max_size)) {
+ QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next);
+ set_alloc_pool_size(get_alloc_pool_size() + 1);
+ return;
+@@ -214,10 +220,10 @@ AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co)
+
+ void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
+ {
+- qatomic_add(&pool_batch_size, additional_pool_size);
++ qatomic_add(&pool_max_size, additional_pool_size);
+ }
+
+ void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
+ {
+- qatomic_sub(&pool_batch_size, removing_pool_size);
++ qatomic_sub(&pool_max_size, removing_pool_size);
+ }
diff --git a/debian/patches/extra/0011-target-i386-do-not-consult-nonexistent-host-leaves.patch b/debian/patches/extra/0011-target-i386-do-not-consult-nonexistent-host-leaves.patch
new file mode 100644
index 0000000..cf535bb
--- /dev/null
+++ b/debian/patches/extra/0011-target-i386-do-not-consult-nonexistent-host-leaves.patch
@@ -0,0 +1,117 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini at redhat.com>
+Date: Fri, 29 Apr 2022 21:16:28 +0200
+Subject: [PATCH] target/i386: do not consult nonexistent host leaves
+
+When cache_info_passthrough is requested, QEMU passes the host values
+of the cache information CPUID leaves down to the guest. However,
+it blindly assumes that the CPUID leaf exists on the host, and this
+cannot be guaranteed: for example, KVM has recently started to
+synthesize AMD leaves up to 0x80000021 in order to provide accurate
+CPU bug information to guests.
+
+Querying a nonexistent host leaf fills the output arguments of
+host_cpuid with data that (albeit deterministic) is nonsensical
+as cache information, namely the data in the highest Intel CPUID
+leaf. If said highest leaf is not ECX-dependent, this can even
+cause an infinite loop when kvm_arch_init_vcpu prepares the input
+to KVM_SET_CPUID2. The infinite loop is only terminated by an
+abort() when the array gets full.
+
+Reported-by: Maxim Levitsky <mlevitsk at redhat.com>
+Reviewed-by: Maxim Levitsky <mlevitsk at redhat.com>
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 798d8ec0dacd4cc0034298d94f430c14f23e2919)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ target/i386/cpu.c | 41 ++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 36 insertions(+), 5 deletions(-)
+
+diff --git a/target/i386/cpu.c b/target/i386/cpu.c
+index 6e6945139b..c79e151887 100644
+--- a/target/i386/cpu.c
++++ b/target/i386/cpu.c
+@@ -5030,6 +5030,37 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
+ return r;
+ }
+
++static void x86_cpu_get_cache_cpuid(uint32_t func, uint32_t index,
++ uint32_t *eax, uint32_t *ebx,
++ uint32_t *ecx, uint32_t *edx)
++{
++ uint32_t level, unused;
++
++ /* Only return valid host leaves. */
++ switch (func) {
++ case 2:
++ case 4:
++ host_cpuid(0, 0, &level, &unused, &unused, &unused);
++ break;
++ case 0x80000005:
++ case 0x80000006:
++ case 0x8000001d:
++ host_cpuid(0x80000000, 0, &level, &unused, &unused, &unused);
++ break;
++ default:
++ return;
++ }
++
++ if (func > level) {
++ *eax = 0;
++ *ebx = 0;
++ *ecx = 0;
++ *edx = 0;
++ } else {
++ host_cpuid(func, index, eax, ebx, ecx, edx);
++ }
++}
++
+ /*
+ * Only for builtin_x86_defs models initialized with x86_register_cpudef_types.
+ */
+@@ -5288,7 +5319,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
+ case 2:
+ /* cache info: needed for Pentium Pro compatibility */
+ if (cpu->cache_info_passthrough) {
+- host_cpuid(index, 0, eax, ebx, ecx, edx);
++ x86_cpu_get_cache_cpuid(index, 0, eax, ebx, ecx, edx);
+ break;
+ } else if (cpu->vendor_cpuid_only && IS_AMD_CPU(env)) {
+ *eax = *ebx = *ecx = *edx = 0;
+@@ -5308,7 +5339,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
+ case 4:
+ /* cache info: needed for Core compatibility */
+ if (cpu->cache_info_passthrough) {
+- host_cpuid(index, count, eax, ebx, ecx, edx);
++ x86_cpu_get_cache_cpuid(index, count, eax, ebx, ecx, edx);
+ /* QEMU gives out its own APIC IDs, never pass down bits 31..26. */
+ *eax &= ~0xFC000000;
+ if ((*eax & 31) && cs->nr_cores > 1) {
+@@ -5710,7 +5741,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
+ case 0x80000005:
+ /* cache info (L1 cache) */
+ if (cpu->cache_info_passthrough) {
+- host_cpuid(index, 0, eax, ebx, ecx, edx);
++ x86_cpu_get_cache_cpuid(index, 0, eax, ebx, ecx, edx);
+ break;
+ }
+ *eax = (L1_DTLB_2M_ASSOC << 24) | (L1_DTLB_2M_ENTRIES << 16) |
+@@ -5723,7 +5754,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
+ case 0x80000006:
+ /* cache info (L2 cache) */
+ if (cpu->cache_info_passthrough) {
+- host_cpuid(index, 0, eax, ebx, ecx, edx);
++ x86_cpu_get_cache_cpuid(index, 0, eax, ebx, ecx, edx);
+ break;
+ }
+ *eax = (AMD_ENC_ASSOC(L2_DTLB_2M_ASSOC) << 28) |
+@@ -5783,7 +5814,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
+ case 0x8000001D:
+ *eax = 0;
+ if (cpu->cache_info_passthrough) {
+- host_cpuid(index, count, eax, ebx, ecx, edx);
++ x86_cpu_get_cache_cpuid(index, count, eax, ebx, ecx, edx);
+ break;
+ }
+ switch (count) {
diff --git a/debian/patches/extra/0012-virtio-scsi-fix-ctrl-and-event-handler-functions-in-.patch b/debian/patches/extra/0012-virtio-scsi-fix-ctrl-and-event-handler-functions-in-.patch
new file mode 100644
index 0000000..92a57cf
--- /dev/null
+++ b/debian/patches/extra/0012-virtio-scsi-fix-ctrl-and-event-handler-functions-in-.patch
@@ -0,0 +1,108 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Hajnoczi <stefanha at redhat.com>
+Date: Wed, 27 Apr 2022 15:35:36 +0100
+Subject: [PATCH] virtio-scsi: fix ctrl and event handler functions in
+ dataplane mode
+
+Commit f34e8d8b8d48d73f36a67b6d5e492ef9784b5012 ("virtio-scsi: prepare
+virtio_scsi_handle_cmd for dataplane") prepared the virtio-scsi cmd
+virtqueue handler function to be used in both the dataplane and
+non-dataplane code paths.
+
+It failed to convert the ctrl and event virtqueue handler functions,
+which are not designed to be called from the dataplane code path but
+will be since the ioeventfd is set up for those virtqueues when
+dataplane starts.
+
+Convert the ctrl and event virtqueue handler functions now so they
+operate correctly when called from the dataplane code path. Avoid code
+duplication by extracting this code into a helper function.
+
+Fixes: f34e8d8b8d48d73f36a67b6d5e492ef9784b5012 ("virtio-scsi: prepare virtio_scsi_handle_cmd for dataplane")
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+Reviewed-by: Paolo Bonzini <pbonzini at redhat.com>
+Message-id: 20220427143541.119567-2-stefanha at redhat.com
+[Fixed s/by used/be used/ typo pointed out by Michael Tokarev
+<mjt at tls.msk.ru>.
+--Stefan]
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+(cherry-picked from commit 2f743ef6366c2df4ef51ef3ae318138cdc0125ab)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/scsi/virtio-scsi.c | 42 +++++++++++++++++++++++++++---------------
+ 1 file changed, 27 insertions(+), 15 deletions(-)
+
+diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
+index 34a968ecfb..417fbc71d6 100644
+--- a/hw/scsi/virtio-scsi.c
++++ b/hw/scsi/virtio-scsi.c
+@@ -472,16 +472,32 @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
+ return progress;
+ }
+
++/*
++ * If dataplane is configured but not yet started, do so now and return true on
++ * success.
++ *
++ * Dataplane is started by the core virtio code but virtqueue handler functions
++ * can also be invoked when a guest kicks before DRIVER_OK, so this helper
++ * function helps us deal with manually starting ioeventfd in that case.
++ */
++static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
++{
++ if (!s->ctx || s->dataplane_started) {
++ return false;
++ }
++
++ virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
++ return !s->dataplane_fenced;
++}
++
+ static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+ {
+ VirtIOSCSI *s = (VirtIOSCSI *)vdev;
+
+- if (s->ctx) {
+- virtio_device_start_ioeventfd(vdev);
+- if (!s->dataplane_fenced) {
+- return;
+- }
++ if (virtio_scsi_defer_to_dataplane(s)) {
++ return;
+ }
++
+ virtio_scsi_acquire(s);
+ virtio_scsi_handle_ctrl_vq(s, vq);
+ virtio_scsi_release(s);
+@@ -720,12 +736,10 @@ static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq)
+ /* use non-QOM casts in the data path */
+ VirtIOSCSI *s = (VirtIOSCSI *)vdev;
+
+- if (s->ctx && !s->dataplane_started) {
+- virtio_device_start_ioeventfd(vdev);
+- if (!s->dataplane_fenced) {
+- return;
+- }
++ if (virtio_scsi_defer_to_dataplane(s)) {
++ return;
+ }
++
+ virtio_scsi_acquire(s);
+ virtio_scsi_handle_cmd_vq(s, vq);
+ virtio_scsi_release(s);
+@@ -855,12 +869,10 @@ static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq)
+ {
+ VirtIOSCSI *s = VIRTIO_SCSI(vdev);
+
+- if (s->ctx) {
+- virtio_device_start_ioeventfd(vdev);
+- if (!s->dataplane_fenced) {
+- return;
+- }
++ if (virtio_scsi_defer_to_dataplane(s)) {
++ return;
+ }
++
+ virtio_scsi_acquire(s);
+ virtio_scsi_handle_event_vq(s, vq);
+ virtio_scsi_release(s);
diff --git a/debian/patches/extra/0013-virtio-scsi-don-t-waste-CPU-polling-the-event-virtqu.patch b/debian/patches/extra/0013-virtio-scsi-don-t-waste-CPU-polling-the-event-virtqu.patch
new file mode 100644
index 0000000..238d456
--- /dev/null
+++ b/debian/patches/extra/0013-virtio-scsi-don-t-waste-CPU-polling-the-event-virtqu.patch
@@ -0,0 +1,91 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Hajnoczi <stefanha at redhat.com>
+Date: Wed, 27 Apr 2022 15:35:37 +0100
+Subject: [PATCH] virtio-scsi: don't waste CPU polling the event virtqueue
+
+The virtio-scsi event virtqueue is not emptied by its handler function.
+This is typical for rx virtqueues where the device uses buffers when
+some event occurs (e.g. a packet is received, an error condition
+happens, etc).
+
+Polling non-empty virtqueues wastes CPU cycles. We are not waiting for
+new buffers to become available, we are waiting for an event to occur,
+so it's a misuse of CPU resources to poll for buffers.
+
+Introduce the new virtio_queue_aio_attach_host_notifier_no_poll() API,
+which is identical to virtio_queue_aio_attach_host_notifier() except
+that it does not poll the virtqueue.
+
+Before this patch the following command-line consumed 100% CPU in the
+IOThread polling and calling virtio_scsi_handle_event():
+
+ $ qemu-system-x86_64 -M accel=kvm -m 1G -cpu host \
+ --object iothread,id=iothread0 \
+ --device virtio-scsi-pci,iothread=iothread0 \
+ --blockdev file,filename=test.img,aio=native,cache.direct=on,node-name=drive0 \
+ --device scsi-hd,drive=drive0
+
+After this patch CPU is no longer wasted.
+
+Reported-by: Nir Soffer <nsoffer at redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+Tested-by: Nir Soffer <nsoffer at redhat.com>
+Message-id: 20220427143541.119567-3-stefanha at redhat.com
+Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
+(cherry-picked from commit 38738f7dbbda90fbc161757b7f4be35b52205552)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/scsi/virtio-scsi-dataplane.c | 2 +-
+ hw/virtio/virtio.c | 13 +++++++++++++
+ include/hw/virtio/virtio.h | 1 +
+ 3 files changed, 15 insertions(+), 1 deletion(-)
+
+diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
+index 29575cbaf6..8bb6e6acfc 100644
+--- a/hw/scsi/virtio-scsi-dataplane.c
++++ b/hw/scsi/virtio-scsi-dataplane.c
+@@ -138,7 +138,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
+
+ aio_context_acquire(s->ctx);
+ virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
+- virtio_queue_aio_attach_host_notifier(vs->event_vq, s->ctx);
++ virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
+
+ for (i = 0; i < vs->conf.num_queues; i++) {
+ virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
+diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
+index 9d637e043e..67a873f54a 100644
+--- a/hw/virtio/virtio.c
++++ b/hw/virtio/virtio.c
+@@ -3534,6 +3534,19 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
+ virtio_queue_host_notifier_aio_poll_end);
+ }
+
++/*
++ * Same as virtio_queue_aio_attach_host_notifier() but without polling. Use
++ * this for rx virtqueues and similar cases where the virtqueue handler
++ * function does not pop all elements. When the virtqueue is left non-empty
++ * polling consumes CPU cycles and should not be used.
++ */
++void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx)
++{
++ aio_set_event_notifier(ctx, &vq->host_notifier, true,
++ virtio_queue_host_notifier_read,
++ NULL, NULL);
++}
++
+ void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx)
+ {
+ aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL, NULL);
+diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
+index b31c4507f5..b62a35fdca 100644
+--- a/include/hw/virtio/virtio.h
++++ b/include/hw/virtio/virtio.h
+@@ -317,6 +317,7 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
+ void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled);
+ void virtio_queue_host_notifier_read(EventNotifier *n);
+ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx);
++void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx);
+ void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx);
+ VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
+ VirtQueue *virtio_vector_next_queue(VirtQueue *vq);
diff --git a/debian/patches/extra/0014-vhost-Track-descriptor-chain-in-private-at-SVQ.patch b/debian/patches/extra/0014-vhost-Track-descriptor-chain-in-private-at-SVQ.patch
new file mode 100644
index 0000000..2f52fd3
--- /dev/null
+++ b/debian/patches/extra/0014-vhost-Track-descriptor-chain-in-private-at-SVQ.patch
@@ -0,0 +1,102 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma at redhat.com>
+Date: Thu, 12 May 2022 19:57:42 +0200
+Subject: [PATCH] vhost: Track descriptor chain in private at SVQ
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The device could have access to modify them, and it definitely has
+access when we implement packed vq. Harden SVQ by maintaining a private
+copy of the descriptor chain. Other fields like buffer addresses are
+already maintained separately.
+
+Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
+Message-Id: <20220512175747.142058-2-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit 495fe3a78749c39c0e772c4e1a55d6cb8a7e5292)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 12 +++++++-----
+ hw/virtio/vhost-shadow-virtqueue.h | 6 ++++++
+ 2 files changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index b232803d1b..3155801f50 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -138,6 +138,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+ for (n = 0; n < num; n++) {
+ if (more_descs || (n + 1 < num)) {
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
++ descs[i].next = cpu_to_le16(svq->desc_next[i]);
+ } else {
+ descs[i].flags = flags;
+ }
+@@ -145,10 +146,10 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
+
+ last = i;
+- i = cpu_to_le16(descs[i].next);
++ i = cpu_to_le16(svq->desc_next[i]);
+ }
+
+- svq->free_head = le16_to_cpu(descs[last].next);
++ svq->free_head = le16_to_cpu(svq->desc_next[last]);
+ }
+
+ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+@@ -336,7 +337,6 @@ static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
+ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+ uint32_t *len)
+ {
+- vring_desc_t *descs = svq->vring.desc;
+ const vring_used_t *used = svq->vring.used;
+ vring_used_elem_t used_elem;
+ uint16_t last_used;
+@@ -365,7 +365,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+ return NULL;
+ }
+
+- descs[used_elem.id].next = svq->free_head;
++ svq->desc_next[used_elem.id] = svq->free_head;
+ svq->free_head = used_elem.id;
+
+ *len = used_elem.len;
+@@ -540,8 +540,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
+ memset(svq->vring.used, 0, device_size);
+ svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
++ svq->desc_next = g_new0(uint16_t, svq->vring.num);
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+- svq->vring.desc[i].next = cpu_to_le16(i + 1);
++ svq->desc_next[i] = cpu_to_le16(i + 1);
+ }
+ }
+
+@@ -574,6 +575,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
+ virtqueue_detach_element(svq->vq, next_avail_elem, 0);
+ }
+ svq->vq = NULL;
++ g_free(svq->desc_next);
+ g_free(svq->ring_id_maps);
+ qemu_vfree(svq->vring.desc);
+ qemu_vfree(svq->vring.used);
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index e5e24c536d..c132c994e9 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -53,6 +53,12 @@ typedef struct VhostShadowVirtqueue {
+ /* Next VirtQueue element that guest made available */
+ VirtQueueElement *next_guest_avail_elem;
+
++ /*
++ * Backup next field for each descriptor so we can recover securely, not
++ * needing to trust the device access.
++ */
++ uint16_t *desc_next;
++
+ /* Next head to expose to the device */
+ uint16_t shadow_avail_idx;
+
diff --git a/debian/patches/extra/0015-vhost-Fix-device-s-used-descriptor-dequeue.patch b/debian/patches/extra/0015-vhost-Fix-device-s-used-descriptor-dequeue.patch
new file mode 100644
index 0000000..497bcec
--- /dev/null
+++ b/debian/patches/extra/0015-vhost-Fix-device-s-used-descriptor-dequeue.patch
@@ -0,0 +1,62 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma at redhat.com>
+Date: Thu, 12 May 2022 19:57:43 +0200
+Subject: [PATCH] vhost: Fix device's used descriptor dequeue
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Only the first descriptor of a used chain was properly enqueued back.
+
+Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding")
+
+Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
+Message-Id: <20220512175747.142058-3-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit 81abfa5724c9a6502d7a1d3a67c55f2a303a1170)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 17 +++++++++++++++--
+ 1 file changed, 15 insertions(+), 2 deletions(-)
+
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index 3155801f50..31fc50907d 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -334,12 +334,22 @@ static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
+ svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+ }
+
++static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
++ uint16_t num, uint16_t i)
++{
++ for (uint16_t j = 0; j < (num - 1); ++j) {
++ i = le16_to_cpu(svq->desc_next[i]);
++ }
++
++ return i;
++}
++
+ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+ uint32_t *len)
+ {
+ const vring_used_t *used = svq->vring.used;
+ vring_used_elem_t used_elem;
+- uint16_t last_used;
++ uint16_t last_used, last_used_chain, num;
+
+ if (!vhost_svq_more_used(svq)) {
+ return NULL;
+@@ -365,7 +375,10 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+ return NULL;
+ }
+
+- svq->desc_next[used_elem.id] = svq->free_head;
++ num = svq->ring_id_maps[used_elem.id]->in_num +
++ svq->ring_id_maps[used_elem.id]->out_num;
++ last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
++ svq->desc_next[last_used_chain] = svq->free_head;
+ svq->free_head = used_elem.id;
+
+ *len = used_elem.len;
diff --git a/debian/patches/extra/0016-vdpa-Fix-bad-index-calculus-at-vhost_vdpa_get_vring_.patch b/debian/patches/extra/0016-vdpa-Fix-bad-index-calculus-at-vhost_vdpa_get_vring_.patch
new file mode 100644
index 0000000..6f33164
--- /dev/null
+++ b/debian/patches/extra/0016-vdpa-Fix-bad-index-calculus-at-vhost_vdpa_get_vring_.patch
@@ -0,0 +1,39 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma at redhat.com>
+Date: Thu, 12 May 2022 19:57:44 +0200
+Subject: [PATCH] vdpa: Fix bad index calculus at vhost_vdpa_get_vring_base
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fixes: 6d0b222666 ("vdpa: Adapt vhost_vdpa_get_vring_base to SVQ")
+
+Acked-by: Jason Wang <jasowang at redhat.com>
+Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
+Message-Id: <20220512175747.142058-4-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit 639036477ef890958415967e753ca2cbb348c16c)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/virtio/vhost-vdpa.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
+index 8adf7c0b92..8555a84f87 100644
+--- a/hw/virtio/vhost-vdpa.c
++++ b/hw/virtio/vhost-vdpa.c
+@@ -1170,11 +1170,11 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+ {
+ struct vhost_vdpa *v = dev->opaque;
++ int vdpa_idx = ring->index - dev->vq_index;
+ int ret;
+
+ if (v->shadow_vqs_enabled) {
+- VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+- ring->index);
++ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+ /*
+ * Setting base as last used idx, so destination will see as available
diff --git a/debian/patches/extra/0017-vdpa-Fix-index-calculus-at-vhost_vdpa_svqs_start.patch b/debian/patches/extra/0017-vdpa-Fix-index-calculus-at-vhost_vdpa_svqs_start.patch
new file mode 100644
index 0000000..8c74f0c
--- /dev/null
+++ b/debian/patches/extra/0017-vdpa-Fix-index-calculus-at-vhost_vdpa_svqs_start.patch
@@ -0,0 +1,35 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma at redhat.com>
+Date: Thu, 12 May 2022 19:57:45 +0200
+Subject: [PATCH] vdpa: Fix index calculus at vhost_vdpa_svqs_start
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+With the introduction of MQ the index of the vq needs to be calculated
+with the device model vq_index.
+
+Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
+Acked-by: Jason Wang <jasowang at redhat.com>
+Message-Id: <20220512175747.142058-5-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit 1c82fdfef8a227518ffecae9d419bcada995c202)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/virtio/vhost-vdpa.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
+index 8555a84f87..aa43cfa7d9 100644
+--- a/hw/virtio/vhost-vdpa.c
++++ b/hw/virtio/vhost-vdpa.c
+@@ -1016,7 +1016,7 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
+ VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+ struct vhost_vring_addr addr = {
+- .index = i,
++ .index = dev->vq_index + i,
+ };
+ int r;
+ bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
diff --git a/debian/patches/extra/0018-hw-virtio-Replace-g_memdup-by-g_memdup2.patch b/debian/patches/extra/0018-hw-virtio-Replace-g_memdup-by-g_memdup2.patch
new file mode 100644
index 0000000..12ea0ad
--- /dev/null
+++ b/debian/patches/extra/0018-hw-virtio-Replace-g_memdup-by-g_memdup2.patch
@@ -0,0 +1,74 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= <philmd at redhat.com>
+Date: Thu, 12 May 2022 19:57:46 +0200
+Subject: [PATCH] hw/virtio: Replace g_memdup() by g_memdup2()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Per https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538
+
+ The old API took the size of the memory to duplicate as a guint,
+ whereas most memory functions take memory sizes as a gsize. This
+ made it easy to accidentally pass a gsize to g_memdup(). For large
+ values, that would lead to a silent truncation of the size from 64
+ to 32 bits, and result in a heap area being returned which is
+ significantly smaller than what the caller expects. This can likely
+ be exploited in various modules to cause a heap buffer overflow.
+
+Replace g_memdup() by the safer g_memdup2() wrapper.
+
+Acked-by: Jason Wang <jasowang at redhat.com>
+Acked-by: Eugenio Pérez <eperezma at redhat.com>
+Signed-off-by: Philippe Mathieu-Daudé <philmd at redhat.com>
+Message-Id: <20220512175747.142058-6-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit d792199de55ca5cb5334016884039c740290b5c7)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/net/virtio-net.c | 3 ++-
+ hw/virtio/virtio-crypto.c | 6 +++---
+ 2 files changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
+index 1067e72b39..e4748a7e6c 100644
+--- a/hw/net/virtio-net.c
++++ b/hw/net/virtio-net.c
+@@ -1443,7 +1443,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+ }
+
+ iov_cnt = elem->out_num;
+- iov2 = iov = g_memdup(elem->out_sg, sizeof(struct iovec) * elem->out_num);
++ iov2 = iov = g_memdup2(elem->out_sg,
++ sizeof(struct iovec) * elem->out_num);
+ s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
+ iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
+ if (s != sizeof(ctrl)) {
+diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
+index dcd80b904d..0e31e3cc04 100644
+--- a/hw/virtio/virtio-crypto.c
++++ b/hw/virtio/virtio-crypto.c
+@@ -242,7 +242,7 @@ static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+ }
+
+ out_num = elem->out_num;
+- out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
++ out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
+ out_iov = out_iov_copy;
+
+ in_num = elem->in_num;
+@@ -605,11 +605,11 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
+ }
+
+ out_num = elem->out_num;
+- out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num);
++ out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
+ out_iov = out_iov_copy;
+
+ in_num = elem->in_num;
+- in_iov_copy = g_memdup(elem->in_sg, sizeof(in_iov[0]) * in_num);
++ in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num);
+ in_iov = in_iov_copy;
+
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req))
diff --git a/debian/patches/extra/0019-vhost-Fix-element-in-vhost_svq_add-failure.patch b/debian/patches/extra/0019-vhost-Fix-element-in-vhost_svq_add-failure.patch
new file mode 100644
index 0000000..daa5bca
--- /dev/null
+++ b/debian/patches/extra/0019-vhost-Fix-element-in-vhost_svq_add-failure.patch
@@ -0,0 +1,47 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma at redhat.com>
+Date: Thu, 12 May 2022 19:57:47 +0200
+Subject: [PATCH] vhost: Fix element in vhost_svq_add failure
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Coverity rightly reports that the element is not freed in that case.
+
+Fixes: Coverity CID 1487559
+Fixes: 100890f7ca ("vhost: Shadow virtqueue buffers forwarding")
+
+Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
+Message-Id: <20220512175747.142058-7-eperezma at redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst at redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst at redhat.com>
+(cherry-picked from commit 5181db132b587754dda3a520eec923b87a65bbb7)
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index 31fc50907d..06d0bb39d9 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -199,11 +199,19 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+ return true;
+ }
+
++/**
++ * Add an element to a SVQ.
++ *
++ * The caller must check that there are enough slots for the new element. It
++ * takes ownership of the element: In case of failure, it is freed and the SVQ
++ * is considered broken.
++ */
+ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+ {
+ unsigned qemu_head;
+ bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
+ if (unlikely(!ok)) {
++ g_free(elem);
+ return false;
+ }
+
diff --git a/debian/patches/series b/debian/patches/series
index 6fa7ca8..3850a52 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,5 +1,22 @@
extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
-extra/0002-block-gluster-correctly-set-max_pdiscard-which-is-in.patch
+extra/0002-block-gluster-correctly-set-max_pdiscard.patch
+extra/0003-block-vmdk-Fix-reopening-bs-file.patch
+extra/0004-linux-aio-fix-unbalanced-plugged-counter-in-laio_io_.patch
+extra/0005-pci-fix-overflow-in-snprintf-string-formatting.patch
+extra/0006-target-i386-kvm-Fix-disabling-MPX-on-cpu-host-with-M.patch
+extra/0007-coroutine-ucontext-use-QEMU_DEFINE_STATIC_CO_TLS.patch
+extra/0008-coroutine-use-QEMU_DEFINE_STATIC_CO_TLS.patch
+extra/0009-coroutine-Rename-qemu_coroutine_inc-dec_pool_size.patch
+extra/0010-coroutine-Revert-to-constant-batch-size.patch
+extra/0011-target-i386-do-not-consult-nonexistent-host-leaves.patch
+extra/0012-virtio-scsi-fix-ctrl-and-event-handler-functions-in-.patch
+extra/0013-virtio-scsi-don-t-waste-CPU-polling-the-event-virtqu.patch
+extra/0014-vhost-Track-descriptor-chain-in-private-at-SVQ.patch
+extra/0015-vhost-Fix-device-s-used-descriptor-dequeue.patch
+extra/0016-vdpa-Fix-bad-index-calculus-at-vhost_vdpa_get_vring_.patch
+extra/0017-vdpa-Fix-index-calculus-at-vhost_vdpa_svqs_start.patch
+extra/0018-hw-virtio-Replace-g_memdup-by-g_memdup2.patch
+extra/0019-vhost-Fix-element-in-vhost_svq_add-failure.patch
bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
--
2.30.2
More information about the pve-devel mailing list