[pve-devel] [PATCH qemu] cherry-pick TCG-related stable fixes for 7.2

Fri Mar 17 13:47:11 CET 2023

When the "KVM hardware virtualization" checkbox is turned off in Proxmox
VE, the TCG accelerator is used instead of KVM, so these fixes are
relevant in that case.

The first patch only touches the TCG tests; it is included so that the
others can be cherry-picked without changes.

Reported-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
---
 ...Introduce-and-use-reg_t-consistently.patch | 286 ++++++++++++++++++
 ...25-target-i386-Fix-BEXTR-instruction.patch |  97 ++++++
 ...i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch |  47 +++
 ...arget-i386-fix-ADOX-followed-by-ADCX.patch | 192 ++++++++++++
 ...028-target-i386-Fix-BZHI-instruction.patch |  64 ++++
 debian/patches/series                         |   5 +
 6 files changed, 691 insertions(+)
 create mode 100644 debian/patches/extra/0024-tests-tcg-i386-Introduce-and-use-reg_t-consistently.patch
 create mode 100644 debian/patches/extra/0025-target-i386-Fix-BEXTR-instruction.patch
 create mode 100644 debian/patches/extra/0026-target-i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch
 create mode 100644 debian/patches/extra/0027-target-i386-fix-ADOX-followed-by-ADCX.patch
 create mode 100644 debian/patches/extra/0028-target-i386-Fix-BZHI-instruction.patch

diff --git a/debian/patches/extra/0024-tests-tcg-i386-Introduce-and-use-reg_t-consistently.patch b/debian/patches/extra/0024-tests-tcg-i386-Introduce-and-use-reg_t-consistently.patch
new file mode 100644
index 0000000..a4bcb71
--- /dev/null
+++ b/debian/patches/extra/0024-tests-tcg-i386-Introduce-and-use-reg_t-consistently.patch
@@ -0,0 +1,286 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Richard Henderson <richard.henderson at linaro.org>
+Date: Sat, 14 Jan 2023 13:05:41 -1000
+Subject: [PATCH] tests/tcg/i386: Introduce and use reg_t consistently
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Define reg_t based on the actual register width.
+Define the inlines using that type.  This will allow
+input registers to 32-bit insns to be set to 64-bit
+values on x86-64, which allows testing various edge cases.
+
+Signed-off-by: Richard Henderson <richard.henderson at linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <philmd at linaro.org>
+Message-Id: <20230114230542.3116013-2-richard.henderson at linaro.org>
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 5d62d6649cd367b5b4a3676e7514d2f9ca86cb03)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ tests/tcg/i386/test-i386-bmi2.c | 182 ++++++++++++++++----------------
+ 1 file changed, 93 insertions(+), 89 deletions(-)
+
+diff --git a/tests/tcg/i386/test-i386-bmi2.c b/tests/tcg/i386/test-i386-bmi2.c
+index 5fadf47510..3c3ef85513 100644
+--- a/tests/tcg/i386/test-i386-bmi2.c
++++ b/tests/tcg/i386/test-i386-bmi2.c
+@@ -3,34 +3,40 @@
+ #include <stdint.h>
+ #include <stdio.h>
+ 
++#ifdef __x86_64
++typedef uint64_t reg_t;
++#else
++typedef uint32_t reg_t;
++#endif
++
+ #define insn1q(name, arg0)                                                           \
+-static inline uint64_t name##q(uint64_t arg0)                                        \
++static inline reg_t name##q(reg_t arg0)                                              \
+ {                                                                                    \
+-    uint64_t result64;                                                               \
++    reg_t result64;                                                                  \
+     asm volatile (#name "q   %1, %0" : "=r"(result64) : "rm"(arg0));                 \
+     return result64;                                                                 \
+ }
+ 
+ #define insn1l(name, arg0)                                                           \
+-static inline uint32_t name##l(uint32_t arg0)                                        \
++static inline reg_t name##l(reg_t arg0)                                              \
+ {                                                                                    \
+-    uint32_t result32;                                                               \
++    reg_t result32;                                                                  \
+     asm volatile (#name "l   %k1, %k0" : "=r"(result32) : "rm"(arg0));               \
+     return result32;                                                                 \
+ }
+ 
+ #define insn2q(name, arg0, c0, arg1, c1)                                             \
+-static inline uint64_t name##q(uint64_t arg0, uint64_t arg1)                         \
++static inline reg_t name##q(reg_t arg0, reg_t arg1)                                  \
+ {                                                                                    \
+-    uint64_t result64;                                                               \
++    reg_t result64;                                                                  \
+     asm volatile (#name "q   %2, %1, %0" : "=r"(result64) : c0(arg0), c1(arg1));     \
+     return result64;                                                                 \
+ }
+ 
+ #define insn2l(name, arg0, c0, arg1, c1)                                             \
+-static inline uint32_t name##l(uint32_t arg0, uint32_t arg1)                         \
++static inline reg_t name##l(reg_t arg0, reg_t arg1)                                  \
+ {                                                                                    \
+-    uint32_t result32;                                                               \
++    reg_t result32;                                                                  \
+     asm volatile (#name "l   %k2, %k1, %k0" : "=r"(result32) : c0(arg0), c1(arg1));  \
+     return result32;                                                                 \
+ }
+@@ -65,130 +71,128 @@ insn1l(blsr, src)
+ int main(int argc, char *argv[]) {
+     uint64_t ehlo = 0x202020204f4c4845ull;
+     uint64_t mask = 0xa080800302020001ull;
+-    uint32_t result32;
++    reg_t result;
+ 
+ #ifdef __x86_64
+-    uint64_t result64;
+-
+     /* 64 bits */
+-    result64 = andnq(mask, ehlo);
+-    assert(result64 == 0x002020204d4c4844);
++    result = andnq(mask, ehlo);
++    assert(result == 0x002020204d4c4844);
+ 
+-    result64 = pextq(ehlo, mask);
+-    assert(result64 == 133);
++    result = pextq(ehlo, mask);
++    assert(result == 133);
+ 
+-    result64 = pdepq(result64, mask);
+-    assert(result64 == (ehlo & mask));
++    result = pdepq(result, mask);
++    assert(result == (ehlo & mask));
+ 
+-    result64 = pextq(-1ull, mask);
+-    assert(result64 == 511); /* mask has 9 bits set */
++    result = pextq(-1ull, mask);
++    assert(result == 511); /* mask has 9 bits set */
+ 
+-    result64 = pdepq(-1ull, mask);
+-    assert(result64 == mask);
++    result = pdepq(-1ull, mask);
++    assert(result == mask);
+ 
+-    result64 = bextrq(mask, 0x3f00);
+-    assert(result64 == (mask & ~INT64_MIN));
++    result = bextrq(mask, 0x3f00);
++    assert(result == (mask & ~INT64_MIN));
+ 
+-    result64 = bextrq(mask, 0x1038);
+-    assert(result64 == 0xa0);
++    result = bextrq(mask, 0x1038);
++    assert(result == 0xa0);
+ 
+-    result64 = bextrq(mask, 0x10f8);
+-    assert(result64 == 0);
++    result = bextrq(mask, 0x10f8);
++    assert(result == 0);
+ 
+-    result64 = blsiq(0x30);
+-    assert(result64 == 0x10);
++    result = blsiq(0x30);
++    assert(result == 0x10);
+ 
+-    result64 = blsiq(0x30ull << 32);
+-    assert(result64 == 0x10ull << 32);
++    result = blsiq(0x30ull << 32);
++    assert(result == 0x10ull << 32);
+ 
+-    result64 = blsmskq(0x30);
+-    assert(result64 == 0x1f);
++    result = blsmskq(0x30);
++    assert(result == 0x1f);
+ 
+-    result64 = blsrq(0x30);
+-    assert(result64 == 0x20);
++    result = blsrq(0x30);
++    assert(result == 0x20);
+ 
+-    result64 = blsrq(0x30ull << 32);
+-    assert(result64 == 0x20ull << 32);
++    result = blsrq(0x30ull << 32);
++    assert(result == 0x20ull << 32);
+ 
+-    result64 = bzhiq(mask, 0x3f);
+-    assert(result64 == (mask & ~INT64_MIN));
++    result = bzhiq(mask, 0x3f);
++    assert(result == (mask & ~INT64_MIN));
+ 
+-    result64 = bzhiq(mask, 0x1f);
+-    assert(result64 == (mask & ~(-1 << 30)));
++    result = bzhiq(mask, 0x1f);
++    assert(result == (mask & ~(-1 << 30)));
+ 
+-    result64 = rorxq(0x2132435465768798, 8);
+-    assert(result64 == 0x9821324354657687);
++    result = rorxq(0x2132435465768798, 8);
++    assert(result == 0x9821324354657687);
+ 
+-    result64 = sarxq(0xffeeddccbbaa9988, 8);
+-    assert(result64 == 0xffffeeddccbbaa99);
++    result = sarxq(0xffeeddccbbaa9988, 8);
++    assert(result == 0xffffeeddccbbaa99);
+ 
+-    result64 = sarxq(0x77eeddccbbaa9988, 8 | 64);
+-    assert(result64 == 0x0077eeddccbbaa99);
++    result = sarxq(0x77eeddccbbaa9988, 8 | 64);
++    assert(result == 0x0077eeddccbbaa99);
+ 
+-    result64 = shrxq(0xffeeddccbbaa9988, 8);
+-    assert(result64 == 0x00ffeeddccbbaa99);
++    result = shrxq(0xffeeddccbbaa9988, 8);
++    assert(result == 0x00ffeeddccbbaa99);
+ 
+-    result64 = shrxq(0x77eeddccbbaa9988, 8 | 192);
+-    assert(result64 == 0x0077eeddccbbaa99);
++    result = shrxq(0x77eeddccbbaa9988, 8 | 192);
++    assert(result == 0x0077eeddccbbaa99);
+ 
+-    result64 = shlxq(0xffeeddccbbaa9988, 8);
+-    assert(result64 == 0xeeddccbbaa998800);
++    result = shlxq(0xffeeddccbbaa9988, 8);
++    assert(result == 0xeeddccbbaa998800);
+ #endif
+ 
+     /* 32 bits */
+-    result32 = andnl(mask, ehlo);
+-    assert(result32 == 0x04d4c4844);
++    result = andnl(mask, ehlo);
++    assert(result == 0x04d4c4844);
+ 
+-    result32 = pextl((uint32_t) ehlo, mask);
+-    assert(result32 == 5);
++    result = pextl((uint32_t) ehlo, mask);
++    assert(result == 5);
+ 
+-    result32 = pdepl(result32, mask);
+-    assert(result32 == (uint32_t)(ehlo & mask));
++    result = pdepl(result, mask);
++    assert(result == (uint32_t)(ehlo & mask));
+ 
+-    result32 = pextl(-1u, mask);
+-    assert(result32 == 7); /* mask has 3 bits set */
++    result = pextl(-1u, mask);
++    assert(result == 7); /* mask has 3 bits set */
+ 
+-    result32 = pdepl(-1u, mask);
+-    assert(result32 == (uint32_t)mask);
++    result = pdepl(-1u, mask);
++    assert(result == (uint32_t)mask);
+ 
+-    result32 = bextrl(mask, 0x1f00);
+-    assert(result32 == (mask & ~INT32_MIN));
++    result = bextrl(mask, 0x1f00);
++    assert(result == (mask & ~INT32_MIN));
+ 
+-    result32 = bextrl(ehlo, 0x1018);
+-    assert(result32 == 0x4f);
++    result = bextrl(ehlo, 0x1018);
++    assert(result == 0x4f);
+ 
+-    result32 = bextrl(mask, 0x1038);
+-    assert(result32 == 0);
++    result = bextrl(mask, 0x1038);
++    assert(result == 0);
+ 
+-    result32 = blsil(0xffff);
+-    assert(result32 == 1);
++    result = blsil(0xffff);
++    assert(result == 1);
+ 
+-    result32 = blsmskl(0x300);
+-    assert(result32 == 0x1ff);
++    result = blsmskl(0x300);
++    assert(result == 0x1ff);
+ 
+-    result32 = blsrl(0xffc);
+-    assert(result32 == 0xff8);
++    result = blsrl(0xffc);
++    assert(result == 0xff8);
+ 
+-    result32 = bzhil(mask, 0xf);
+-    assert(result32 == 1);
++    result = bzhil(mask, 0xf);
++    assert(result == 1);
+ 
+-    result32 = rorxl(0x65768798, 8);
+-    assert(result32 == 0x98657687);
++    result = rorxl(0x65768798, 8);
++    assert(result == 0x98657687);
+ 
+-    result32 = sarxl(0xffeeddcc, 8);
+-    assert(result32 == 0xffffeedd);
++    result = sarxl(0xffeeddcc, 8);
++    assert(result == 0xffffeedd);
+ 
+-    result32 = sarxl(0x77eeddcc, 8 | 32);
+-    assert(result32 == 0x0077eedd);
++    result = sarxl(0x77eeddcc, 8 | 32);
++    assert(result == 0x0077eedd);
+ 
+-    result32 = shrxl(0xffeeddcc, 8);
+-    assert(result32 == 0x00ffeedd);
++    result = shrxl(0xffeeddcc, 8);
++    assert(result == 0x00ffeedd);
+ 
+-    result32 = shrxl(0x77eeddcc, 8 | 128);
+-    assert(result32 == 0x0077eedd);
++    result = shrxl(0x77eeddcc, 8 | 128);
++    assert(result == 0x0077eedd);
+ 
+-    result32 = shlxl(0xffeeddcc, 8);
+-    assert(result32 == 0xeeddcc00);
++    result = shlxl(0xffeeddcc, 8);
++    assert(result == 0xeeddcc00);
+ 
+     return 0;
+ }
diff --git a/debian/patches/extra/0025-target-i386-Fix-BEXTR-instruction.patch b/debian/patches/extra/0025-target-i386-Fix-BEXTR-instruction.patch
new file mode 100644
index 0000000..38282b2
--- /dev/null
+++ b/debian/patches/extra/0025-target-i386-Fix-BEXTR-instruction.patch
@@ -0,0 +1,97 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Richard Henderson <richard.henderson at linaro.org>
+Date: Sat, 14 Jan 2023 13:05:42 -1000
+Subject: [PATCH] target/i386: Fix BEXTR instruction
+
+There were two problems here: not limiting the input to operand bits,
+and not correctly handling large extraction length.
+
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1372
+Signed-off-by: Richard Henderson <richard.henderson at linaro.org>
+Message-Id: <20230114230542.3116013-3-richard.henderson at linaro.org>
+Cc: qemu-stable at nongnu.org
+Fixes: 1d0b926150e5 ("target/i386: move scalar 0F 38 and 0F 3A instruction to new decoder", 2022-10-18)
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit b14c0098975264ed03144f145bca0179a6763a07)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ target/i386/tcg/emit.c.inc      | 22 +++++++++++-----------
+ tests/tcg/i386/test-i386-bmi2.c | 12 ++++++++++++
+ 2 files changed, 23 insertions(+), 11 deletions(-)
+
+diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
+index 7037ff91c6..99f6ba6e19 100644
+--- a/target/i386/tcg/emit.c.inc
++++ b/target/i386/tcg/emit.c.inc
+@@ -1078,30 +1078,30 @@ static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+     MemOp ot = decode->op[0].ot;
+-    TCGv bound, zero;
++    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
++    TCGv zero = tcg_constant_tl(0);
++    TCGv mone = tcg_constant_tl(-1);
+ 
+     /*
+      * Extract START, and shift the operand.
+      * Shifts larger than operand size get zeros.
+      */
+     tcg_gen_ext8u_tl(s->A0, s->T1);
++    if (TARGET_LONG_BITS == 64 && ot == MO_32) {
++        tcg_gen_ext32u_tl(s->T0, s->T0);
++    }
+     tcg_gen_shr_tl(s->T0, s->T0, s->A0);
+ 
+-    bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
+-    zero = tcg_constant_tl(0);
+     tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, bound, s->T0, zero);
+ 
+     /*
+-     * Extract the LEN into a mask.  Lengths larger than
+-     * operand size get all ones.
++     * Extract the LEN into an inverse mask.  Lengths larger than
++     * operand size get all zeros, length 0 gets all ones.
+      */
+     tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
+-    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->A0, bound, s->A0, bound);
+-
+-    tcg_gen_movi_tl(s->T1, 1);
+-    tcg_gen_shl_tl(s->T1, s->T1, s->A0);
+-    tcg_gen_subi_tl(s->T1, s->T1, 1);
+-    tcg_gen_and_tl(s->T0, s->T0, s->T1);
++    tcg_gen_shl_tl(s->T1, mone, s->A0);
++    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, bound, s->T1, zero);
++    tcg_gen_andc_tl(s->T0, s->T0, s->T1);
+ 
+     gen_op_update1_cc(s);
+     set_cc_op(s, CC_OP_LOGICB + ot);
+diff --git a/tests/tcg/i386/test-i386-bmi2.c b/tests/tcg/i386/test-i386-bmi2.c
+index 3c3ef85513..982d4abda4 100644
+--- a/tests/tcg/i386/test-i386-bmi2.c
++++ b/tests/tcg/i386/test-i386-bmi2.c
+@@ -99,6 +99,9 @@ int main(int argc, char *argv[]) {
+     result = bextrq(mask, 0x10f8);
+     assert(result == 0);
+ 
++    result = bextrq(0xfedcba9876543210ull, 0x7f00);
++    assert(result == 0xfedcba9876543210ull);
++
+     result = blsiq(0x30);
+     assert(result == 0x10);
+ 
+@@ -164,6 +167,15 @@ int main(int argc, char *argv[]) {
+     result = bextrl(mask, 0x1038);
+     assert(result == 0);
+ 
++    result = bextrl((reg_t)0x8f635a775ad3b9b4ull, 0x3018);
++    assert(result == 0x5a);
++
++    result = bextrl((reg_t)0xfedcba9876543210ull, 0x7f00);
++    assert(result == 0x76543210u);
++
++    result = bextrl(-1, 0);
++    assert(result == 0);
++
+     result = blsil(0xffff);
+     assert(result == 1);
+ 
diff --git a/debian/patches/extra/0026-target-i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch b/debian/patches/extra/0026-target-i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch
new file mode 100644
index 0000000..c743d55
--- /dev/null
+++ b/debian/patches/extra/0026-target-i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch
@@ -0,0 +1,47 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Richard Henderson <richard.henderson at linaro.org>
+Date: Sat, 14 Jan 2023 08:06:01 -1000
+Subject: [PATCH] target/i386: Fix C flag for BLSI, BLSMSK, BLSR
+
+We forgot to set cc_src, which is used for computing C.
+
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1370
+Signed-off-by: Richard Henderson <richard.henderson at linaro.org>
+Message-Id: <20230114180601.2993644-1-richard.henderson at linaro.org>
+Cc: qemu-stable at nongnu.org
+Fixes: 1d0b926150e5 ("target/i386: move scalar 0F 38 and 0F 3A instruction to new decoder", 2022-10-18)
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 99282098dc74c2055bde5652bde6cf0067d0c370)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ target/i386/tcg/emit.c.inc | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
+index 99f6ba6e19..4d7702c106 100644
+--- a/target/i386/tcg/emit.c.inc
++++ b/target/i386/tcg/emit.c.inc
+@@ -1111,6 +1111,7 @@ static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+     MemOp ot = decode->op[0].ot;
+ 
++    tcg_gen_mov_tl(cpu_cc_src, s->T0);
+     tcg_gen_neg_tl(s->T1, s->T0);
+     tcg_gen_and_tl(s->T0, s->T0, s->T1);
+     tcg_gen_mov_tl(cpu_cc_dst, s->T0);
+@@ -1121,6 +1122,7 @@ static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode
+ {
+     MemOp ot = decode->op[0].ot;
+ 
++    tcg_gen_mov_tl(cpu_cc_src, s->T0);
+     tcg_gen_subi_tl(s->T1, s->T0, 1);
+     tcg_gen_xor_tl(s->T0, s->T0, s->T1);
+     tcg_gen_mov_tl(cpu_cc_dst, s->T0);
+@@ -1131,6 +1133,7 @@ static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+     MemOp ot = decode->op[0].ot;
+ 
++    tcg_gen_mov_tl(cpu_cc_src, s->T0);
+     tcg_gen_subi_tl(s->T1, s->T0, 1);
+     tcg_gen_and_tl(s->T0, s->T0, s->T1);
+     tcg_gen_mov_tl(cpu_cc_dst, s->T0);
diff --git a/debian/patches/extra/0027-target-i386-fix-ADOX-followed-by-ADCX.patch b/debian/patches/extra/0027-target-i386-fix-ADOX-followed-by-ADCX.patch
new file mode 100644
index 0000000..bb108e5
--- /dev/null
+++ b/debian/patches/extra/0027-target-i386-fix-ADOX-followed-by-ADCX.patch
@@ -0,0 +1,192 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini at redhat.com>
+Date: Tue, 31 Jan 2023 09:48:03 +0100
+Subject: [PATCH] target/i386: fix ADOX followed by ADCX
+
+When ADCX is followed by ADOX or vice versa, the second instruction's
+carry comes from EFLAGS and the condition codes use the CC_OP_ADCOX
+operation.  Retrieving the carry from EFLAGS is handled by this bit
+of gen_ADCOX:
+
+        tcg_gen_extract_tl(carry_in, cpu_cc_src,
+            ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
+
+Unfortunately, in this case cc_op has been overwritten by the previous
+"if" statement to CC_OP_ADCOX.  This works by chance when the first
+instruction is ADCX; however, if the first instruction is ADOX,
+ADCX will incorrectly take its carry from OF instead of CF.
+
+Fix by moving the computation of the new cc_op at the end of the function.
+The included exhaustive test case fails without this patch and passes
+afterwards.
+
+Because ADCX/ADOX need not be invoked through the VEX prefix, this
+regression bisects to commit 16fc5726a6e2 ("target/i386: reimplement
+0x0f 0x38, add AVX", 2022-10-18).  However, the mistake happened a
+little earlier, when BMI instructions were rewritten using the new
+decoder framework.
+
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1471
+Reported-by: Paul Jolly <https://gitlab.com/myitcv>
+Fixes: 1d0b926150e5 ("target/i386: move scalar 0F 38 and 0F 3A instruction to new decoder", 2022-10-18)
+Cc: qemu-stable at nongnu.org
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 60c7dd22e1383754d5f150bc9f7c2785c662a7b6)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ target/i386/tcg/emit.c.inc       | 20 +++++----
+ tests/tcg/i386/Makefile.target   |  6 ++-
+ tests/tcg/i386/test-i386-adcox.c | 75 ++++++++++++++++++++++++++++++++
+ 3 files changed, 91 insertions(+), 10 deletions(-)
+ create mode 100644 tests/tcg/i386/test-i386-adcox.c
+
+diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
+index 4d7702c106..0d7c6e80ae 100644
+--- a/target/i386/tcg/emit.c.inc
++++ b/target/i386/tcg/emit.c.inc
+@@ -1015,6 +1015,7 @@ VSIB_AVX(VPGATHERQ, vpgatherq)
+ 
+ static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
+ {
++    int opposite_cc_op;
+     TCGv carry_in = NULL;
+     TCGv carry_out = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
+     TCGv zero;
+@@ -1022,14 +1023,8 @@ static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
+     if (cc_op == s->cc_op || s->cc_op == CC_OP_ADCOX) {
+         /* Re-use the carry-out from a previous round.  */
+         carry_in = carry_out;
+-        cc_op = s->cc_op;
+-    } else if (s->cc_op == CC_OP_ADCX || s->cc_op == CC_OP_ADOX) {
+-        /* Merge with the carry-out from the opposite instruction.  */
+-        cc_op = CC_OP_ADCOX;
+-    }
+-
+-    /* If we don't have a carry-in, get it out of EFLAGS.  */
+-    if (!carry_in) {
++    } else {
++        /* We don't have a carry-in, get it out of EFLAGS.  */
+         if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+             gen_compute_eflags(s);
+         }
+@@ -1053,7 +1048,14 @@ static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
+         tcg_gen_add2_tl(s->T0, carry_out, s->T0, carry_out, s->T1, zero);
+         break;
+     }
+-    set_cc_op(s, cc_op);
++
++    opposite_cc_op = cc_op == CC_OP_ADCX ? CC_OP_ADOX : CC_OP_ADCX;
++    if (s->cc_op == CC_OP_ADCOX || s->cc_op == opposite_cc_op) {
++        /* Merge with the carry-out from the opposite instruction.  */
++        set_cc_op(s, CC_OP_ADCOX);
++    } else {
++        set_cc_op(s, cc_op);
++    }
+ }
+ 
+ static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
+index 81831cafbc..bafd8c2180 100644
+--- a/tests/tcg/i386/Makefile.target
++++ b/tests/tcg/i386/Makefile.target
+@@ -14,7 +14,7 @@ config-cc.mak: Makefile
+ I386_SRCS=$(notdir $(wildcard $(I386_SRC)/*.c))
+ ALL_X86_TESTS=$(I386_SRCS:.c=)
+ SKIP_I386_TESTS=test-i386-ssse3 test-avx test-3dnow test-mmx
+-X86_64_TESTS:=$(filter test-i386-bmi2 $(SKIP_I386_TESTS), $(ALL_X86_TESTS))
++X86_64_TESTS:=$(filter test-i386-adcox test-i386-bmi2 $(SKIP_I386_TESTS), $(ALL_X86_TESTS))
+ 
+ test-i386-sse-exceptions: CFLAGS += -msse4.1 -mfpmath=sse
+ run-test-i386-sse-exceptions: QEMU_OPTS += -cpu max
+@@ -28,6 +28,10 @@ test-i386-bmi2: CFLAGS=-O2
+ run-test-i386-bmi2: QEMU_OPTS += -cpu max
+ run-plugin-test-i386-bmi2-%: QEMU_OPTS += -cpu max
+ 
++test-i386-adcox: CFLAGS=-O2
++run-test-i386-adcox: QEMU_OPTS += -cpu max
++run-plugin-test-i386-adcox-%: QEMU_OPTS += -cpu max
++
+ #
+ # hello-i386 is a barebones app
+ #
+diff --git a/tests/tcg/i386/test-i386-adcox.c b/tests/tcg/i386/test-i386-adcox.c
+new file mode 100644
+index 0000000000..16169efff8
+--- /dev/null
++++ b/tests/tcg/i386/test-i386-adcox.c
+@@ -0,0 +1,75 @@
++/* See if various BMI2 instructions give expected results */
++#include <assert.h>
++#include <stdint.h>
++#include <stdio.h>
++
++#define CC_C 1
++#define CC_O (1 << 11)
++
++#ifdef __x86_64__
++#define REG uint64_t
++#else
++#define REG uint32_t
++#endif
++
++void test_adox_adcx(uint32_t in_c, uint32_t in_o, REG adcx_operand, REG adox_operand)
++{
++    REG flags;
++    REG out_adcx, out_adox;
++
++    asm("pushf; pop %0" : "=r"(flags));
++    flags &= ~(CC_C | CC_O);
++    flags |= (in_c ? CC_C : 0);
++    flags |= (in_o ? CC_O : 0);
++
++    out_adcx = adcx_operand;
++    out_adox = adox_operand;
++    asm("push %0; popf;"
++        "adox %3, %2;"
++        "adcx %3, %1;"
++        "pushf; pop %0"
++        : "+r" (flags), "+r" (out_adcx), "+r" (out_adox)
++        : "r" ((REG)-1), "0" (flags), "1" (out_adcx), "2" (out_adox));
++
++    assert(out_adcx == in_c + adcx_operand - 1);
++    assert(out_adox == in_o + adox_operand - 1);
++    assert(!!(flags & CC_C) == (in_c || adcx_operand));
++    assert(!!(flags & CC_O) == (in_o || adox_operand));
++}
++
++void test_adcx_adox(uint32_t in_c, uint32_t in_o, REG adcx_operand, REG adox_operand)
++{
++    REG flags;
++    REG out_adcx, out_adox;
++
++    asm("pushf; pop %0" : "=r"(flags));
++    flags &= ~(CC_C | CC_O);
++    flags |= (in_c ? CC_C : 0);
++    flags |= (in_o ? CC_O : 0);
++
++    out_adcx = adcx_operand;
++    out_adox = adox_operand;
++    asm("push %0; popf;"
++        "adcx %3, %1;"
++        "adox %3, %2;"
++        "pushf; pop %0"
++        : "+r" (flags), "+r" (out_adcx), "+r" (out_adox)
++        : "r" ((REG)-1), "0" (flags), "1" (out_adcx), "2" (out_adox));
++
++    assert(out_adcx == in_c + adcx_operand - 1);
++    assert(out_adox == in_o + adox_operand - 1);
++    assert(!!(flags & CC_C) == (in_c || adcx_operand));
++    assert(!!(flags & CC_O) == (in_o || adox_operand));
++}
++
++int main(int argc, char *argv[]) {
++    /* try all combinations of input CF, input OF, CF from op1+op2,  OF from op2+op1 */
++    int i;
++    for (i = 0; i <= 15; i++) {
++        printf("%d\n", i);
++        test_adcx_adox(!!(i & 1), !!(i & 2), !!(i & 4), !!(i & 8));
++        test_adox_adcx(!!(i & 1), !!(i & 2), !!(i & 4), !!(i & 8));
++    }
++    return 0;
++}
++
diff --git a/debian/patches/extra/0028-target-i386-Fix-BZHI-instruction.patch b/debian/patches/extra/0028-target-i386-Fix-BZHI-instruction.patch
new file mode 100644
index 0000000..391817c
--- /dev/null
+++ b/debian/patches/extra/0028-target-i386-Fix-BZHI-instruction.patch
@@ -0,0 +1,64 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Richard Henderson <richard.henderson at linaro.org>
+Date: Sat, 14 Jan 2023 13:32:06 -1000
+Subject: [PATCH] target/i386: Fix BZHI instruction
+
+We did not correctly handle N >= operand size.
+
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1374
+Signed-off-by: Richard Henderson <richard.henderson at linaro.org>
+Message-Id: <20230114233206.3118472-1-richard.henderson at linaro.org>
+Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
+(cherry-picked from commit 9ad2ba6e8e7fc195d0dd0b76ab38bd2fceb1bdd4)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ target/i386/tcg/emit.c.inc      | 14 +++++++-------
+ tests/tcg/i386/test-i386-bmi2.c |  3 +++
+ 2 files changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
+index 0d7c6e80ae..7296f3952c 100644
+--- a/target/i386/tcg/emit.c.inc
++++ b/target/i386/tcg/emit.c.inc
+@@ -1145,20 +1145,20 @@ static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+ {
+     MemOp ot = decode->op[0].ot;
+-    TCGv bound;
++    TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
++    TCGv zero = tcg_constant_tl(0);
++    TCGv mone = tcg_constant_tl(-1);
+ 
+-    tcg_gen_ext8u_tl(s->T1, cpu_regs[s->vex_v]);
+-    bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
++    tcg_gen_ext8u_tl(s->T1, s->T1);
+ 
+     /*
+      * Note that since we're using BMILG (in order to get O
+      * cleared) we need to store the inverse into C.
+      */
+-    tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src, s->T1, bound);
+-    tcg_gen_movcond_tl(TCG_COND_GT, s->T1, s->T1, bound, bound, s->T1);
++    tcg_gen_setcond_tl(TCG_COND_LEU, cpu_cc_src, s->T1, bound);
+ 
+-    tcg_gen_movi_tl(s->A0, -1);
+-    tcg_gen_shl_tl(s->A0, s->A0, s->T1);
++    tcg_gen_shl_tl(s->A0, mone, s->T1);
++    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, bound, s->A0, zero);
+     tcg_gen_andc_tl(s->T0, s->T0, s->A0);
+ 
+     gen_op_update1_cc(s);
+diff --git a/tests/tcg/i386/test-i386-bmi2.c b/tests/tcg/i386/test-i386-bmi2.c
+index 982d4abda4..0244df7987 100644
+--- a/tests/tcg/i386/test-i386-bmi2.c
++++ b/tests/tcg/i386/test-i386-bmi2.c
+@@ -123,6 +123,9 @@ int main(int argc, char *argv[]) {
+     result = bzhiq(mask, 0x1f);
+     assert(result == (mask & ~(-1 << 30)));
+ 
++    result = bzhiq(mask, 0x40);
++    assert(result == mask);
++
+     result = rorxq(0x2132435465768798, 8);
+     assert(result == 0x9821324354657687);
+ 
diff --git a/debian/patches/series b/debian/patches/series
index 70d525f..4e8ddd6 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -21,6 +21,11 @@ extra/0020-intel-iommu-fail-DEVIOTLB_UNMAP-without-dt-mode.patch
 extra/0021-memory-Allow-disabling-re-entrancy-checking-per-MR.patch
 extra/0022-lsi53c895a-disable-reentrancy-detection-for-script-R.patch
 extra/0023-acpi-cpuhp-fix-guest-visible-maximum-access-size-to-.patch
+extra/0024-tests-tcg-i386-Introduce-and-use-reg_t-consistently.patch
+extra/0025-target-i386-Fix-BEXTR-instruction.patch
+extra/0026-target-i386-Fix-C-flag-for-BLSI-BLSMSK-BLSR.patch
+extra/0027-target-i386-fix-ADOX-followed-by-ADCX.patch
+extra/0028-target-i386-Fix-BZHI-instruction.patch
 bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
 bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
 bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
-- 
2.30.2