From b265ee7bae1192ecc55ecaf7e63c8ed84cc2f509 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:49 +1000 Subject: KVM: SEV: move set_dr_intercepts/clr_dr_intercepts from the header Static functions set_dr_intercepts() and clr_dr_intercepts() are only called from SVM so move them to .c. No functional change intended. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Carlos Bilbao Reviewed-by: Tom Lendacky Reviewed-by: Santosh Shukla Link: https://lore.kernel.org/r/20230615063757.3039121-2-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 42 ------------------------------------------ 2 files changed, 42 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d381ad424554..42f17ae99c9e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -677,6 +677,48 @@ free_save_area: } +static void set_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + if (!sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + } + + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + + recalc_intercepts(svm); +} + +static void clr_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.intercepts[INTERCEPT_DR] = 0; + + /* DR7 access must remain intercepted for an SEV-ES guest */ + if (sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + } + + recalc_intercepts(svm); +} + static int direct_access_msr_slot(u32 msr) { u32 i; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 18af7e712a5a..800ca1776b59 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -404,48 +404,6 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3 return test_bit(bit, (unsigned long *)&control->intercepts); } -static inline void set_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - if (!sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); - } - - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - vmcb->control.intercepts[INTERCEPT_DR] = 0; - - /* DR7 access must remain intercepted for an SEV-ES guest */ - if (sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - } - - recalc_intercepts(svm); -} - static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = svm->vmcb01.ptr; -- cgit From 29de732cc95cb5219d8c2bd145cb7293cb572fc3 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:50 +1000 Subject: KVM: SEV: Move SEV's GP_VECTOR intercept setup to SEV Currently SVM setup is done sequentially in init_vmcb() -> sev_init_vmcb() -> sev_es_init_vmcb() and tries keeping SVM/SEV/SEV-ES bits separated. One of the exceptions is #GP intercept which init_vmcb() skips setting for SEV guests and then sev_es_init_vmcb() needlessly clears it. Remove the SEV check from init_vmcb(). Clear the #GP intercept in sev_init_vmcb(). SEV-ES will use the SEV setting. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Alexey Kardashevskiy Reviewed-by: Carlos Bilbao Reviewed-by: Tom Lendacky Reviewed-by: Santosh Shukla Link: https://lore.kernel.org/r/20230615063757.3039121-3-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 9 ++++++--- arch/x86/kvm/svm/svm.c | 5 ++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 07756b7348ae..fd6e316c1a23 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2974,9 +2974,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR4_WRITE); svm_set_intercept(svm, TRAP_CR8_WRITE); - /* No support for enable_vmware_backdoor */ - clr_exception_intercept(svm, GP_VECTOR); - /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); @@ -3002,6 +2999,12 @@ void sev_init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + /* + * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as + * KVM can't decrypt guest memory to decode the faulting instruction. + */ + clr_exception_intercept(svm, GP_VECTOR); + if (sev_es_guest(svm->vcpu.kvm)) sev_es_init_vmcb(svm); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 42f17ae99c9e..a0e803f2a574 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1243,10 +1243,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. Don't intercept #GP for SEV guests as KVM can't - * decrypt guest memory to decode the faulting instruction. + * as VMware does. */ - if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) + if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); -- cgit From f8d808ed1ba0019997f18ff2be0cf85e33a31022 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jun 2023 16:37:51 +1000 Subject: KVM: SVM: Rewrite sev_es_prepare_switch_to_guest()'s comment about swap types Rewrite the comment(s) in sev_es_prepare_switch_to_guest() to explain the swap types employed by the CPU for SEV-ES guests, i.e. to explain why KVM needs to save a seemingly random subset of host state, and to provide a decoder for the APM's Type-A/B/C terminology. Signed-off-by: Alexey Kardashevskiy Link: https://lore.kernel.org/r/20230615063757.3039121-4-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index fd6e316c1a23..d58a25a92aa3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3023,19 +3023,24 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* - * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. KVM performs the - * corresponding VMSAVE in svm_prepare_guest_switch for both - * traditional and SEV-ES guests. + * All host state for SEV-ES guests is categorized into three swap types + * based on how it is handled by hardware during a world switch: + * + * A: VMRUN: Host state saved in host save area + * VMEXIT: Host state loaded from host save area + * + * B: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state loaded from host save area + * + * C: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state initialized to default(reset) values + * + * Manually save type-B state, i.e. state that is loaded by VMEXIT but + * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed + * by common SVM code). */ - - /* XCR0 is restored on VMEXIT, save the current host value */ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - - /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); - - /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ hostsa->xss = host_xss; } -- cgit From 2837dd00f8fc69111cd6b1dc8481d2fb490d11c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:52 +1000 Subject: KVM: SEV-ES: explicitly disable debug SVM/SEV enable debug registers intercepts to skip swapping DRs on entering/exiting the guest. When the guest is in control of debug registers (vcpu->guest_debug == 0), there is an optimisation to reduce the number of context switches: intercepts are cleared and the KVM_DEBUGREG_WONT_EXIT flag is set to tell KVM to do swapping on guest enter/exit. The same code also executes for SEV-ES, however it has no effect as - it always takes (vcpu->guest_debug == 0) branch; - KVM_DEBUGREG_WONT_EXIT is set but DR7 intercept is not cleared; - vcpu_enter_guest() writes DRs but VMRUN for SEV-ES swaps them with the values from _encrypted_ VMSA. Be explicit about SEV-ES not supporting debug: - return right away from dr_interception() and skip unnecessary processing; - return an error right away from the KVM_SEV_LAUNCH_UPDATE_VMSA handler if debugging was already enabled. KVM_SET_GUEST_DEBUG are failing already after KVM_SEV_LAUNCH_UPDATE_VMSA is finished due to vcpu->arch.guest_state_protected set to true. Add WARN_ON to kvm_x86::sync_dirty_debug_regs() (saves guest DRs on guest exit) to signify that SEV-ES won't hit that path. Suggested-by: Sean Christopherson Signed-off-by: Alexey Kardashevskiy Link: https://lore.kernel.org/r/20230615063757.3039121-5-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 5 +++++ arch/x86/kvm/svm/svm.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d58a25a92aa3..0299bbe188f1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -619,6 +619,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); int ret; + if (vcpu->guest_debug) { + pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); + return -EINVAL; + } + /* Perform some pre-encryption checks against the VMSA */ ret = sev_es_sync_vmsa(svm); if (ret) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a0e803f2a574..e838cfecc0fa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1983,7 +1983,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->arch.guest_state_protected) + if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) return; get_debugreg(vcpu->arch.db[0], 0); @@ -2714,6 +2714,13 @@ static int dr_interception(struct kvm_vcpu *vcpu) unsigned long val; int err = 0; + /* + * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT + * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. + */ + if (sev_es_guest(vcpu->kvm)) + return 1; + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers -- cgit From c2690b5f01948dfec19d50086312848c0b5b02b0 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:53 +1000 Subject: KVM: SVM/SEV/SEV-ES: Rework intercepts Currently SVM setup is done sequentially in init_vmcb() -> sev_init_vmcb() -> sev_es_init_vmcb() and tries keeping SVM/SEV/SEV-ES bits separated. One of the exceptions is DR intercepts which is for SEV-ES before sev_es_init_vmcb() runs. Move the SEV-ES intercept setup to sev_es_init_vmcb(). From now on set_dr_intercepts()/clr_dr_intercepts() handle SVM/SEV only. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Alexey Kardashevskiy Reviewed-by: Santosh Shukla Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20230615063757.3039121-6-aik@amd.com [sean: drop comment about intercepting DR7] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 6 ++++++ arch/x86/kvm/svm/svm.c | 37 ++++++++++++++----------------------- 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0299bbe188f1..b3ef509f6fda 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2951,6 +2951,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; @@ -2979,6 +2980,11 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR4_WRITE); svm_set_intercept(svm, TRAP_CR8_WRITE); + vmcb->control.intercepts[INTERCEPT_DR] = 0; + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + recalc_intercepts(svm); + /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e838cfecc0fa..8fad9ebb5126 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -681,23 +681,20 @@ static void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; - if (!sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); - } - + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); @@ -710,12 +707,6 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) vmcb->control.intercepts[INTERCEPT_DR] = 0; - /* DR7 access must remain intercepted for an SEV-ES guest */ - if (sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - } - recalc_intercepts(svm); } -- cgit From d1f85fbe836e61ed330a242f927a98abd98929cc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:54 +1000 Subject: KVM: SEV: Enable data breakpoints in SEV-ES Add support for "DebugSwap for SEV-ES guests", which provides support for swapping DR[0-3] and DR[0-3]_ADDR_MASK on VMRUN and VMEXIT, i.e. allows KVM to expose debug capabilities to SEV-ES guests. Without DebugSwap support, the CPU doesn't save/load most _guest_ debug registers (except DR6/7), and KVM cannot manually context switch guest DRs due the VMSA being encrypted. Enable DebugSwap if and only if the CPU also supports NoNestedDataBp, which causes the CPU to ignore nested #DBs, i.e. #DBs that occur when vectoring a #DB. Without NoNestedDataBp, a malicious guest can DoS the host by putting the CPU into an infinite loop of vectoring #DBs (see https://bugzilla.redhat.com/show_bug.cgi?id=1278496) Set the features bit in sev_es_sync_vmsa() which is the last point when VMSA is not encrypted yet as sev_(es_)init_vmcb() (where the most init happens) is called not only when VCPU is initialised but also on intrahost migration when VMSA is encrypted. Eliminate DR7 intercepts as KVM can't modify guest DR7, and intercepting DR7 would completely defeat the purpose of enabling DebugSwap. Make X86_FEATURE_DEBUG_SWAP appear in /proc/cpuinfo (by not adding "") to let the operator know if the VM can debug. Signed-off-by: Alexey Kardashevskiy Link: https://lore.kernel.org/r/20230615063757.3039121-7-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/svm.h | 1 + arch/x86/kvm/svm/sev.c | 36 +++++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..31c862d79fae 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -434,6 +434,7 @@ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e7c7379d6ac7..72ebd5e4e975 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -288,6 +288,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) struct vmcb_seg { u16 selector; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b3ef509f6fda..b7cd0cc4a19c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "mmu.h" #include "x86.h" @@ -54,9 +55,14 @@ module_param_named(sev, sev_enabled, bool, 0444); /* enable/disable SEV-ES support */ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); + +/* enable/disable SEV-ES DebugSwap support */ +static bool sev_es_debug_swap_enabled = true; +module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #else #define sev_enabled false #define sev_es_enabled false +#define sev_es_debug_swap_enabled false #endif /* CONFIG_KVM_AMD_SEV */ static u8 sev_enc_bit; @@ -606,6 +612,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + if (sev_es_debug_swap_enabled) + save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -2261,6 +2270,9 @@ out: sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || + !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) + sev_es_debug_swap_enabled = false; #endif } @@ -2981,9 +2993,11 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR8_WRITE); vmcb->control.intercepts[INTERCEPT_DR] = 0; - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - recalc_intercepts(svm); + if (!sev_es_debug_swap_enabled) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + recalc_intercepts(svm); + } /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); @@ -3053,6 +3067,22 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); hostsa->pkru = read_pkru(); hostsa->xss = host_xss; + + /* + * If DebugSwap is enabled, debug registers are loaded but NOT saved by + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both + * saves and loads debug registers (Type-A). + */ + if (sev_es_debug_swap_enabled) { + hostsa->dr0 = native_get_debugreg(0); + hostsa->dr1 = native_get_debugreg(1); + hostsa->dr2 = native_get_debugreg(2); + hostsa->dr3 = native_get_debugreg(3); + hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); + hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); + hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); + hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); + } } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) -- cgit From 90cbf6d914ad7856ca1145dee02babb9eab7bec1 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Jun 2023 16:37:55 +1000 Subject: KVM: SEV-ES: Eliminate #DB intercept when DebugSwap enabled Disable #DB for SEV-ES guests when DebugSwap is enabled. There is no point in such intercept as KVM does not allow guest debug for SEV-ES guests. Signed-off-by: Alexey Kardashevskiy Link: https://lore.kernel.org/r/20230615063757.3039121-8-aik@amd.com [sean: add comment as to why KVM disables #DB intercept iff DebugSwap=1] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b7cd0cc4a19c..b35cd670ce66 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2997,6 +2997,17 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); + } else { + /* + * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't + * allow debugging SEV-ES guests, and enables DebugSwap iff + * NO_NESTED_DATA_BP is supported, so there's no reason to + * intercept #DB when DebugSwap is enabled. For simplicity + * with respect to guest debug, intercept #DB for other VMs + * even if NO_NESTED_DATA_BP is supported, i.e. even if the + * guest can't DoS the CPU with infinite #DB vectoring. + */ + clr_exception_intercept(svm, DB_VECTOR); } /* Can't intercept XSETBV, HV can't modify XCR0 directly */ -- cgit From 389fbbec261b2842fd0e34b26a2b288b122cc406 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jun 2023 16:37:56 +1000 Subject: KVM: SVM: Don't defer NMI unblocking until next exit for SEV-ES guests Immediately mark NMIs as unmasked in response to #VMGEXIT(NMI complete) instead of setting awaiting_iret_completion and waiting until the *next* VM-Exit to unmask NMIs. The whole point of "NMI complete" is that the guest is responsible for telling the hypervisor when it's safe to inject an NMI, i.e. there's no need to wait. And because there's no IRET to single-step, the next VM-Exit could be a long time coming, i.e. KVM could incorrectly hold an NMI pending for far longer than what is required and expected. Opportunistically fix a stale reference to HF_IRET_MASK. Fixes: 916b54a7688b ("KVM: x86: Move HF_NMI_MASK and HF_IRET_MASK into "struct vcpu_svm"") Fixes: 4444dfe4050b ("KVM: SVM: Add NMI support for an SEV-ES guest") Cc: Tom Lendacky Link: https://lore.kernel.org/r/20230615063757.3039121-9-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 5 ++++- arch/x86/kvm/svm/svm.c | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b35cd670ce66..2cd15783dfb9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2900,7 +2900,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); + ++vcpu->stat.nmi_window_exits; + svm->nmi_masked = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: ret = kvm_emulate_ap_reset_hold(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8fad9ebb5126..b15bc158ecc6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2535,12 +2535,13 @@ static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); + ++vcpu->stat.nmi_window_exits; svm->awaiting_iret_completion = true; svm_clr_iret_intercept(svm); - if (!sev_es_guest(vcpu->kvm)) - svm->nmi_iret_rip = kvm_rip_read(vcpu); + svm->nmi_iret_rip = kvm_rip_read(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; @@ -3950,12 +3951,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) svm->soft_int_injected = false; /* - * If we've made progress since setting HF_IRET_MASK, we've + * If we've made progress since setting awaiting_iret_completion, we've * executed an IRET and can allow NMI injection. */ if (svm->awaiting_iret_completion && - (sev_es_guest(vcpu->kvm) || - kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { + kvm_rip_read(vcpu) != svm->nmi_iret_rip) { svm->awaiting_iret_completion = false; svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit From a6bb5709029756a6151afed0eec26ba2bfabed51 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jun 2023 16:37:57 +1000 Subject: KVM: SVM: Don't try to pointlessly single-step SEV-ES guests for NMI window Bail early from svm_enable_nmi_window() for SEV-ES guests without trying to enable single-step of the guest, as single-stepping an SEV-ES guest is impossible and the guest is responsible for *telling* KVM when it is ready for an new NMI to be injected. Functionally, setting TF and RF in svm->vmcb->save.rflags is benign as the field is ignored by hardware, but it's all kinds of confusing. Signed-off-by: Alexey Kardashevskiy Link: https://lore.kernel.org/r/20230615063757.3039121-10-aik@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b15bc158ecc6..1bc0936bbd51 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3802,6 +3802,19 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ + /* + * SEV-ES guests are responsible for signaling when a vCPU is ready to + * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. + * KVM can't intercept and single-step IRET to detect when NMIs are + * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. + * + * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware + * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not + * supported NAEs in the GHCB protocol. + */ + if (sev_es_guest(vcpu->kvm)) + return; + if (!gif_set(svm)) { if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); -- cgit From d518f8cc10af7a1265595fccf553a3a5f7121e4f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 7 Jun 2023 13:35:17 -0700 Subject: KVM: SVM: Fix dead KVM_BUG() code in LBR MSR virtualization Refactor KVM's handling of LBR MSRs on SVM to avoid a second layer of case statements, and thus eliminate a dead KVM_BUG() call, which (a) will never be hit in the current code base and (b) if a future commit breaks things, will never fire as KVM passes "false" instead "true" or '1' for the KVM_BUG() condition. Reported-by: Michal Luczaj Cc: Yuan Yao Link: https://lore.kernel.org/r/20230607203519.1570167-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1bc0936bbd51..623916221ff7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -980,43 +980,22 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); } -static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) { /* - * If the LBR virtualization is disabled, the LBR msrs are always - * kept in the vmcb01 to avoid copying them on nested guest entries. - * - * If nested, and the LBR virtualization is enabled/disabled, the msrs - * are moved between the vmcb01 and vmcb02 as needed. + * If LBR virtualization is disabled, the LBR MSRs are always kept in + * vmcb01. If LBR virtualization is enabled and L1 is running VMs of + * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. */ - struct vmcb *vmcb = - (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? - svm->vmcb : svm->vmcb01.ptr; - - switch (index) { - case MSR_IA32_DEBUGCTLMSR: - return vmcb->save.dbgctl; - case MSR_IA32_LASTBRANCHFROMIP: - return vmcb->save.br_from; - case MSR_IA32_LASTBRANCHTOIP: - return vmcb->save.br_to; - case MSR_IA32_LASTINTFROMIP: - return vmcb->save.last_excp_from; - case MSR_IA32_LASTINTTOIP: - return vmcb->save.last_excp_to; - default: - KVM_BUG(false, svm->vcpu.kvm, - "%s: Unknown MSR 0x%x", __func__, index); - return 0; - } + return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : + svm->vmcb01.ptr; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & - DEBUGCTLMSR_LBR; + bool enable_lbrv = svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR; bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); @@ -2835,11 +2814,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: + msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + break; case MSR_IA32_LASTBRANCHFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + break; case MSR_IA32_LASTBRANCHTOIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + break; case MSR_IA32_LASTINTFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_msr(svm, msr_info->index); + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; -- cgit From 41dfb5f13ed9101b217e7085e5b5fb07d705e27f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 7 Jun 2023 13:35:18 -0700 Subject: KVM: SVM: Clean up handling of LBR virtualization enabled Clean up the enable_lbrv computation in svm_update_lbrv() to consolidate the logic for computing enable_lbrv into a single statement, and to remove the coding style violations (lack of curly braces on nested if). No functional change intended. Link: https://lore.kernel.org/r/20230607203519.1570167-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 623916221ff7..6fa918b588d2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -994,15 +994,10 @@ static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - - bool enable_lbrv = svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR; - - bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & - LBR_CTL_ENABLE_MASK); - - if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) - if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) - enable_lbrv = true; + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; + bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && svm->lbrv_enabled && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) return; -- cgit From a85cd52d720588b0060a0ebac5c7f497ad6bcd86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 7 Jun 2023 13:35:19 -0700 Subject: KVM: SVM: Use svm_get_lbr_vmcb() helper to handle writes to DEBUGCTL Use the recently introduced svm_get_lbr_vmcb() instead an open coded equivalent to retrieve the target VMCB when emulating writes to MSR_IA32_DEBUGCTLMSR. No functional change intended. Link: https://lore.kernel.org/r/20230607203519.1570167-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6fa918b588d2..d76c8a0764f1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3052,13 +3052,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) - svm->vmcb->save.dbgctl = data; - else - svm->vmcb01.ptr->save.dbgctl = data; - + svm_get_lbr_vmcb(svm)->save.dbgctl = data; svm_update_lbrv(vcpu); - break; case MSR_VM_HSAVE_PA: /* -- cgit From 4c08e737f056fec930b416a2bd37ed266d724f95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 8 Aug 2023 16:31:31 -0700 Subject: KVM: SVM: Take and hold ir_list_lock when updating vCPU's Physical ID entry Hoist the acquisition of ir_list_lock from avic_update_iommu_vcpu_affinity() to its two callers, avic_vcpu_load() and avic_vcpu_put(), specifically to encapsulate the write to the vCPU's entry in the AVIC Physical ID table. This will allow a future fix to pull information from the Physical ID entry when updating the IRTE, without potentially consuming stale information, i.e. without racing with the vCPU being (un)loaded. Add a comment to call out that ir_list_lock does NOT protect against multiple writers, specifically that reading the Physical ID entry in avic_vcpu_put() outside of the lock is safe. To preserve some semblance of independence from ir_list_lock, keep the READ_ONCE() in avic_vcpu_load() even though acuiring the spinlock effectively ensures the load(s) will be generated after acquiring the lock. Cc: stable@vger.kernel.org Tested-by: Alejandro Jimenez Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20230808233132.2499764-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index cfc8ab773025..8e041b215ddb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -986,10 +986,11 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { int ret = 0; - unsigned long flags; struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_held(&svm->ir_list_lock); + if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; @@ -997,19 +998,15 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - if (list_empty(&svm->ir_list)) - goto out; + return 0; list_for_each_entry(ir, &svm->ir_list, node) { ret = amd_iommu_update_ga(cpu, r, ir->data); if (ret) - break; + return ret; } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + return 0; } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1017,6 +1014,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 entry; int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); @@ -1033,6 +1031,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; + spin_lock_irqsave(&svm->ir_list_lock, flags); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -1042,25 +1042,40 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) { u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ entry = READ_ONCE(*(svm->avic_physical_id_cache)); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) return; + spin_lock_irqsave(&svm->ir_list_lock, flags); + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) -- cgit From f3cebc75e7425d6949d726bb8e937095b0aef025 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 8 Aug 2023 16:31:32 -0700 Subject: KVM: SVM: Set target pCPU during IRTE update if target vCPU is running Update the target pCPU for IOMMU doorbells when updating IRTE routing if KVM is actively running the associated vCPU. KVM currently only updates the pCPU when loading the vCPU (via avic_vcpu_load()), and so doorbell events will be delayed until the vCPU goes through a put+load cycle (which might very well "never" happen for the lifetime of the VM). To avoid inserting a stale pCPU, e.g. due to racing between updating IRTE routing and vCPU load/put, get the pCPU information from the vCPU's Physical APIC ID table entry (a.k.a. avic_physical_id_cache in KVM) and update the IRTE while holding ir_list_lock. Add comments with --verbose enabled to explain exactly what is and isn't protected by ir_list_lock. Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt") Reported-by: dengqiao.joey Cc: stable@vger.kernel.org Cc: Alejandro Jimenez Cc: Joao Martins Cc: Maxim Levitsky Cc: Suravee Suthikulpanit Tested-by: Alejandro Jimenez Reviewed-by: Joao Martins Link: https://lore.kernel.org/r/20230808233132.2499764-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8e041b215ddb..2092db892d7d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) int ret = 0; unsigned long flags; struct amd_svm_iommu_ir *ir; + u64 entry; /** * In some cases, the existing irte is updated and re-set, @@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) ir->data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); + + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is running. + * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM + * will update the pCPU info when the vCPU awkened and/or scheduled in. + * See also avic_vcpu_load(). + */ + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, + true, pi->ir_data); + list_add(&ir->node, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); out: @@ -1031,6 +1044,13 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; + /* + * Grab the per-vCPU interrupt remapping lock even if the VM doesn't + * _currently_ have assigned devices, as that can change. Holding + * ir_list_lock ensures that either svm_ir_list_add() will consume + * up-to-date entry information, or that this task will wait until + * svm_ir_list_add() completes to set the new target pCPU. + */ spin_lock_irqsave(&svm->ir_list_lock, flags); entry = READ_ONCE(*(svm->avic_physical_id_cache)); @@ -1067,6 +1087,14 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) return; + /* + * Take and hold the per-vCPU interrupt remapping lock while updating + * the Physical ID entry even though the lock doesn't protect against + * multiple writers (see above). Holding ir_list_lock ensures that + * either svm_ir_list_add() will consume up-to-date entry information, + * or that this task will wait until svm_ir_list_add() completes to + * mark the vCPU as not running. + */ spin_lock_irqsave(&svm->ir_list_lock, flags); avic_update_iommu_vcpu_affinity(vcpu, -1, 0); -- cgit From f67063414c0e83bb4a9e12358cc179af53c2a8bb Mon Sep 17 00:00:00 2001 From: Manali Shukla Date: Mon, 17 Jul 2023 04:19:03 +0000 Subject: KVM: SVM: correct the size of spec_ctrl field in VMCB save area Correct the spec_ctrl field in the VMCB save area based on the AMD Programmer's manual. Originally, the spec_ctrl was listed as u32 with 4 bytes of reserved area. The AMD Programmer's Manual now lists the spec_ctrl as 8 bytes in VMCB save area. The Public Processor Programming reference for Genoa, shows SPEC_CTRL as 64b register, but the AMD Programmer's Manual lists SPEC_CTRL as 32b register. This discrepancy will be cleaned up in next revision of the AMD Programmer's Manual. Since remaining bits above bit 7 are reserved bits in SPEC_CTRL MSR and thus, not being used, the spec_ctrl added as u32 in the VMCB save area is currently not an issue. Fixes: 3dd2775b74c9 ("KVM: SVM: Create a separate mapping for the SEV-ES save area") Suggested-by: Tom Lendacky Signed-off-by: Manali Shukla Link: https://lore.kernel.org/r/20230717041903.85480-1-manali.shukla@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 72ebd5e4e975..19bf955b67e0 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -346,7 +346,7 @@ struct vmcb_save_area { u64 last_excp_from; u64 last_excp_to; u8 reserved_0x298[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -513,7 +513,7 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 -- cgit From f1187ef24eb8f36e8ad8106d22615ceddeea6097 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 19:23:56 -0700 Subject: KVM: SVM: Get source vCPUs from source VM for SEV-ES intrahost migration Fix a goof where KVM tries to grab source vCPUs from the destination VM when doing intrahost migration. Grabbing the wrong vCPU not only hoses the guest, it also crashes the host due to the VMSA pointer being left NULL. BUG: unable to handle page fault for address: ffffe38687000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 39 PID: 17143 Comm: sev_migrate_tes Tainted: GO 6.5.0-smp--fff2e47e6c3b-next #151 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 34.28.0 07/10/2023 RIP: 0010:__free_pages+0x15/0xd0 RSP: 0018:ffff923fcf6e3c78 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffe38687000000 RCX: 0000000000000100 RDX: 0000000000000100 RSI: 0000000000000000 RDI: ffffe38687000000 RBP: ffff923fcf6e3c88 R08: ffff923fcafb0000 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff83619b90 R12: ffff923fa9540000 R13: 0000000000080007 R14: ffff923f6d35d000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff929d0d7c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffe38687000000 CR3: 0000005224c34005 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: sev_free_vcpu+0xcb/0x110 [kvm_amd] svm_vcpu_free+0x75/0xf0 [kvm_amd] kvm_arch_vcpu_destroy+0x36/0x140 [kvm] kvm_destroy_vcpus+0x67/0x100 [kvm] kvm_arch_destroy_vm+0x161/0x1d0 [kvm] kvm_put_kvm+0x276/0x560 [kvm] kvm_vm_release+0x25/0x30 [kvm] __fput+0x106/0x280 ____fput+0x12/0x20 task_work_run+0x86/0xb0 do_exit+0x2e3/0x9c0 do_group_exit+0xb1/0xc0 __x64_sys_exit_group+0x1b/0x20 do_syscall_64+0x41/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd CR2: ffffe38687000000 Fixes: 6defa24d3b12 ("KVM: SEV: Init target VMCBs in sev_migrate_from") Cc: stable@vger.kernel.org Cc: Peter Gonda Reviewed-by: Peter Gonda Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20230825022357.2852133-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2cd15783dfb9..acc700bcb299 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1739,7 +1739,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * Note, the source is not required to have the same number of * vCPUs as the destination when migrating a vanilla SEV VM. */ - src_vcpu = kvm_get_vcpu(dst_kvm, i); + src_vcpu = kvm_get_vcpu(src_kvm, i); src_svm = to_svm(src_vcpu); /* -- cgit From 1952e74da96fb3e48b72a2d0ece78c688a5848c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 19:23:57 -0700 Subject: KVM: SVM: Skip VMSA init in sev_es_init_vmcb() if pointer is NULL Skip initializing the VMSA physical address in the VMCB if the VMSA is NULL, which occurs during intrahost migration as KVM initializes the VMCB before copying over state from the source to the destination (including the VMSA and its physical address). In normal builds, __pa() is just math, so the bug isn't fatal, but with CONFIG_DEBUG_VIRTUAL=y, the validity of the virtual address is verified and passing in NULL will make the kernel unhappy. Fixes: 6defa24d3b12 ("KVM: SEV: Init target VMCBs in sev_migrate_from") Cc: stable@vger.kernel.org Cc: Peter Gonda Reviewed-by: Peter Gonda Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20230825022357.2852133-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index acc700bcb299..5585a3556179 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2975,9 +2975,12 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* * An SEV-ES guest requires a VMSA area that is a separate from the * VMCB page. Do not include the encryption mask on the VMSA physical - * address since hardware will access it using the guest key. + * address since hardware will access it using the guest key. Note, + * the VMSA will be NULL if this vCPU is the destination for intrahost + * migration, and will be copied later. */ - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (svm->sev_es.vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); -- cgit From cb49631ad111570f1bad37702c11c2ae07fa2e3c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 18:36:18 -0700 Subject: KVM: SVM: Don't inject #UD if KVM attempts to skip SEV guest insn Don't inject a #UD if KVM attempts to "emulate" to skip an instruction for an SEV guest, and instead resume the guest and hope that it can make forward progress. When commit 04c40f344def ("KVM: SVM: Inject #UD on attempted emulation for SEV guest w/o insn buffer") added the completely arbitrary #UD behavior, there were no known scenarios where a well-behaved guest would induce a VM-Exit that triggered emulation, i.e. it was thought that injecting #UD would be helpful. However, now that KVM (correctly) attempts to re-inject INT3/INTO, e.g. if a #NPF is encountered when attempting to deliver the INT3/INTO, an SEV guest can trigger emulation without a buffer, through no fault of its own. Resuming the guest and retrying the INT3/INTO is architecturally wrong, e.g. the vCPU will incorrectly re-hit code #DBs, but for SEV guests there is literally no other option that has a chance of making forward progress. Drop the #UD injection for all "skip" emulation, not just those related to INT3/INTO, even though that means that the guest will likely end up in an infinite loop instead of getting a #UD (the vCPU may also crash, e.g. if KVM emulated everything about an instruction except for advancing RIP). There's no evidence that suggests that an unexpected #UD is actually better than hanging the vCPU, e.g. a soft-hung vCPU can still respond to IRQs and NMIs to generate a backtrace. Reported-by: Wu Zongyo Closes: https://lore.kernel.org/all/8eb933fd-2cf3-d7a9-32fe-2a1d82eac42a@mail.ustc.edu.cn Fixes: 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction") Cc: stable@vger.kernel.org Cc: Tom Lendacky Link: https://lore.kernel.org/r/20230825013621.2845700-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d76c8a0764f1..d7a474571ff1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -365,6 +365,8 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -385,6 +387,14 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, } if (!svm->next_rip) { + /* + * FIXME: Drop this when kvm_emulate_instruction() does the + * right thing and treats "can't emulate" as outright failure + * for EMULTYPE_SKIP. + */ + if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0)) + return 0; + if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; @@ -4677,16 +4687,25 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and * decode garbage. * - * Inject #UD if KVM reached this point without an instruction buffer. - * In practice, this path should never be hit by a well-behaved guest, - * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path - * is still theoretically reachable, e.g. via unaccelerated fault-like - * AVIC access, and needs to be handled by KVM to avoid putting the - * guest into an infinite loop. Injecting #UD is somewhat arbitrary, - * but its the least awful option given lack of insight into the guest. + * If KVM is NOT trying to simply skip an instruction, inject #UD if + * KVM reached this point without an instruction buffer. In practice, + * this path should never be hit by a well-behaved guest, e.g. KVM + * doesn't intercept #UD or #GP for SEV guests, but this path is still + * theoretically reachable, e.g. via unaccelerated fault-like AVIC + * access, and needs to be handled by KVM to avoid putting the guest + * into an infinite loop. Injecting #UD is somewhat arbitrary, but + * its the least awful option given lack of insight into the guest. + * + * If KVM is trying to skip an instruction, simply resume the guest. + * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM + * will attempt to re-inject the INT3/INTO and skip the instruction. + * In that scenario, retrying the INT3/INTO and hoping the guest will + * make forward progress is the only option that has a chance of + * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (!(emul_type & EMULTYPE_SKIP)) + kvm_queue_exception(vcpu, UD_VECTOR); return false; } -- cgit From 80d0f521d59e08eeaa0bc5d624da139448fb99b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Aug 2023 18:36:19 -0700 Subject: KVM: SVM: Require nrips support for SEV guests (and beyond) Disallow SEV (and beyond) if nrips is disabled via module param, as KVM can't read guest memory to partially emulate and skip an instruction. All CPUs that support SEV support NRIPS, i.e. this is purely stopping the user from shooting themselves in the foot. Cc: Tom Lendacky Link: https://lore.kernel.org/r/20230825013621.2845700-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 11 ++++------- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5585a3556179..85d1abdf7d7d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2185,7 +2185,7 @@ void __init sev_hardware_setup(void) bool sev_es_supported = false; bool sev_supported = false; - if (!sev_enabled || !npt_enabled) + if (!sev_enabled || !npt_enabled || !nrips) goto out; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d7a474571ff1..5cf2380c89dd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -203,7 +203,7 @@ static int nested = true; module_param(nested, int, S_IRUGO); /* enable/disable Next RIP Save */ -static int nrips = true; +int nrips = true; module_param(nrips, int, 0444); /* enable/disable Virtual VMLOAD VMSAVE */ @@ -5156,9 +5156,11 @@ static __init int svm_hardware_setup(void) svm_adjust_mmio_mask(); + nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which - * may be modified by svm_adjust_mmio_mask()). + * may be modified by svm_adjust_mmio_mask()), as well as nrips. */ sev_hardware_setup(); @@ -5170,11 +5172,6 @@ static __init int svm_hardware_setup(void) goto err; } - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 800ca1776b59..1498956a589f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -33,6 +33,7 @@ #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int nrips; extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; -- cgit