[pve-devel] [PATCH kernel] cherry-pick fix for RCU stall issue after VM live migration
Fiona Ebner
f.ebner at proxmox.com
Mon Nov 27 18:39:48 CET 2023
The RCU stall is caused by a lapic timer interrupt getting lost.
Already queued for 6.5.13:
https://lore.kernel.org/stable/20231124172031.920738810@linuxfoundation.org/
Reported in the community forum:
https://forum.proxmox.com/threads/136992/
Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
---
...c-timer-interrupt-lost-after-loading.patch | 126 ++++++++++++++++++
1 file changed, 126 insertions(+)
create mode 100644 patches/kernel/0017-KVM-x86-Fix-lapic-timer-interrupt-lost-after-loading.patch
diff --git a/patches/kernel/0017-KVM-x86-Fix-lapic-timer-interrupt-lost-after-loading.patch b/patches/kernel/0017-KVM-x86-Fix-lapic-timer-interrupt-lost-after-loading.patch
new file mode 100644
index 0000000..ea8bff6
--- /dev/null
+++ b/patches/kernel/0017-KVM-x86-Fix-lapic-timer-interrupt-lost-after-loading.patch
@@ -0,0 +1,126 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Greg Kroah-Hartman <gregkh at linuxfoundation.org>
+Date: Fri, 24 Nov 2023 17:48:01 +0000
+Subject: [PATCH] KVM: x86: Fix lapic timer interrupt lost after loading a
+ snapshot.
+
+commit 9cfec6d097c607e36199cf0cfbb8cf5acbd8e9b2 upstream.
+
+When running android emulator (which is based on QEMU 2.12) on
+certain Intel hosts with kernel version 6.3-rc1 or above, guest
+will freeze after loading a snapshot. This is almost 100%
+reproducible. By default, the android emulator will use snapshot
+to speed up the next launching of the same android guest. So
+this breaks the android emulator badly.
+
+I tested QEMU 8.0.4 from Debian 12 with an Ubuntu 22.04 guest by
+running command "loadvm" after "savevm". The same issue is
+observed. At the same time, none of our AMD platforms is impacted.
+More experiments show that loading the KVM module with
+"enable_apicv=false" can workaround it.
+
+The issue started to show up after commit 8e6ed96cdd50 ("KVM: x86:
+fire timer when it is migrated and expired, and in oneshot mode").
+However, as is pointed out by Sean Christopherson, it is introduced
+by commit 967235d32032 ("KVM: vmx: clear pending interrupts on
+KVM_SET_LAPIC"). commit 8e6ed96cdd50 ("KVM: x86: fire timer when
+it is migrated and expired, and in oneshot mode") just makes it
+easier to hit the issue.
+
+Having both commits, the oneshot lapic timer gets fired immediately
+inside the KVM_SET_LAPIC call when loading the snapshot. On Intel
+platforms with APIC virtualization and posted interrupt processing,
+this eventually leads to setting the corresponding PIR bit. However,
+the whole PIR bits get cleared later in the same KVM_SET_LAPIC call
+by apicv_post_state_restore. This leads to timer interrupt lost.
+
+The fix is to move vmx_apicv_post_state_restore to the beginning of
+the KVM_SET_LAPIC call and rename to vmx_apicv_pre_state_restore.
+What vmx_apicv_post_state_restore does is actually clearing any
+former apicv state and this behavior is more suitable to carry out
+in the beginning.
+
+Fixes: 967235d32032 ("KVM: vmx: clear pending interrupts on KVM_SET_LAPIC")
+Cc: stable at vger.kernel.org
+Suggested-by: Sean Christopherson <seanjc at google.com>
+Signed-off-by: Haitao Shan <hshan at google.com>
+Link: https://lore.kernel.org/r/20230913000215.478387-1-hshan@google.com
+Signed-off-by: Sean Christopherson <seanjc at google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh at linuxfoundation.org>
+(picked from https://lore.kernel.org/stable/20231124172031.920738810@linuxfoundation.org/)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ arch/x86/include/asm/kvm-x86-ops.h | 1 +
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/lapic.c | 4 ++++
+ arch/x86/kvm/vmx/vmx.c | 4 ++--
+ 4 files changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
+index e3054e3e46d52..9b419f0de713c 100644
+--- a/arch/x86/include/asm/kvm-x86-ops.h
++++ b/arch/x86/include/asm/kvm-x86-ops.h
+@@ -108,6 +108,7 @@ KVM_X86_OP_OPTIONAL(vcpu_blocking)
+ KVM_X86_OP_OPTIONAL(vcpu_unblocking)
+ KVM_X86_OP_OPTIONAL(pi_update_irte)
+ KVM_X86_OP_OPTIONAL(pi_start_assignment)
++KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
+ KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
+ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+ KVM_X86_OP_OPTIONAL(set_hv_timer)
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index f72b30d2238a6..9bdbb1cc03d38 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1690,6 +1690,7 @@ struct kvm_x86_ops {
+ int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+ void (*pi_start_assignment)(struct kvm *kvm);
++ void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
+ void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+ bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
+
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index e74e223f46aa3..a3d488608b85d 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -2649,6 +2649,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+ u64 msr_val;
+ int i;
+
++ static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
++
+ if (!init_event) {
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+@@ -2960,6 +2962,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
++ static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
++
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index bc6f0fea48b43..52af279f793db 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6909,7 +6909,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+ }
+
+-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
++static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+@@ -8275,7 +8275,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+- .apicv_post_state_restore = vmx_apicv_post_state_restore,
++ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
--
2.39.2
More information about the pve-devel mailing list