From 910c57dfa4d113aae6571c2a8b9ae8c430975902 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 14 Feb 2024 17:00:03 -0800 Subject: KVM: x86: Mark target gfn of emulated atomic instruction as dirty When emulating an atomic access on behalf of the guest, mark the target gfn dirty if the CMPXCHG by KVM is attempted and doesn't fault. This fixes a bug where KVM effectively corrupts guest memory during live migration by writing to guest memory without informing userspace that the page is dirty. Marking the page dirty got unintentionally dropped when KVM's emulated CMPXCHG was converted to do a user access. Before that, KVM explicitly mapped the guest page into kernel memory, and marked the page dirty during the unmap phase. Mark the page dirty even if the CMPXCHG fails, as the old data is written back on failure, i.e. the page is still written. The value written is guaranteed to be the same because the operation is atomic, but KVM's ABI is that all writes are dirty logged regardless of the value written. And more importantly, that's what KVM did before the buggy commit. Huge kudos to the folks on the Cc list (and many others), who did all the actual work of triaging and debugging. Fixes: 1c2361f667f3 ("KVM: x86: Use __try_cmpxchg_user() to emulate atomic accesses") Cc: stable@vger.kernel.org Cc: David Matlack Cc: Pasha Tatashin Cc: Michael Krebs base-commit: 6769ea8da8a93ed4630f1ce64df6aafcaabfce64 Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20240215010004.1456078-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48a61d283406..e4270eaa33df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (r < 0) return X86EMUL_UNHANDLEABLE; + + /* + * Mark the page dirty _before_ checking whether or not the CMPXCHG was + * successful, as the old value is written back on failure. Note, for + * live migration, this is unnecessarily conservative as CMPXCHG writes + * back the original value and the access is atomic, but KVM's ABI is + * that all writes are dirty logged, regardless of the value written. + */ + kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa)); + if (r) return X86EMUL_CMPXCHG_FAILED; -- cgit From 422692098c4c53a6b65c2ef235621aee6a38721f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:09 -0800 Subject: KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP Rewrite the help message for KVM_SW_PROTECTED_VM to make it clear that software-protected VMs are a development and testing vehicle for guest_memfd(), and that attempting to use KVM_SW_PROTECTED_VM for anything remotely resembling a "real" VM will fail. E.g. any memory accesses from KVM will incorrectly access shared memory, nested TDP is wildly broken, and so on and so forth. Update KVM's API documentation with similar warnings to discourage anyone from attempting to run anything but selftests with KVM_X86_SW_PROTECTED_VM. Fixes: 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") Link: https://lore.kernel.org/r/20240222190612.2942589-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 87e3da7b0439..65ed14b6540b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM depends on KVM && X86_64 select KVM_GENERIC_PRIVATE_MEM help - Enable support for KVM software-protected VMs. Currently "protected" - means the VM can be backed with memory provided by - KVM_CREATE_GUEST_MEMFD. + Enable support for KVM software-protected VMs. Currently, software- + protected VMs are purely a development and testing vehicle for + KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a + software-protected VM will fail miserably. If unsure, say "N". -- cgit From a1176ef5c92aa58e63ecf184b7cac2e311b2b233 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:10 -0800 Subject: KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU Advertise and support software-protected VMs if and only if the TDP MMU is enabled, i.e. disallow KVM_SW_PROTECTED_VM if TDP is enabled for KVM's legacy/shadow MMU. TDP support for the shadow MMU is maintenance-only, e.g. support for TDX and SNP will also be restricted to the TDP MMU. Fixes: 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") Link: https://lore.kernel.org/r/20240222190612.2942589-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48a61d283406..3638a104bcf7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4580,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) { return type == KVM_X86_DEFAULT_VM || (type == KVM_X86_SW_PROTECTED_VM && - IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled); + IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) -- cgit From 5ef1d8c1ddbf696e47b226e11888eaf8d9e8e807 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 Feb 2024 17:34:30 -0800 Subject: KVM: SVM: Flush pages under kvm->lock to fix UAF in svm_register_enc_region() Do the cache flush of converted pages in svm_register_enc_region() before dropping kvm->lock to fix use-after-free issues where region and/or its array of pages could be freed by a different task, e.g. if userspace has __unregister_enc_region_locked() already queued up for the region. Note, the "obvious" alternative of using local variables doesn't fully resolve the bug, as region->pages is also dynamically allocated. I.e. the region structure itself would be fine, but region->pages could be freed. Flushing multiple pages under kvm->lock is unfortunate, but the entire flow is a rare slow path, and the manual flush is only needed on CPUs that lack coherency for encrypted memory. Fixes: 19a23da53932 ("Fix unsynchronized access to sev members through svm_register_enc_region") Reported-by: Gabe Kirkpatrick Cc: Josh Eads Cc: Peter Gonda Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20240217013430.2079561-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f760106c31f8..a132547fcfb5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1975,20 +1975,22 @@ int sev_mem_enc_register_region(struct kvm *kvm, goto e_free; } - region->uaddr = range->addr; - region->size = range->size; - - list_add_tail(®ion->list, &sev->regions_list); - mutex_unlock(&kvm->lock); - /* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are * flushed to ensure that guest data gets written into memory with - * correct C-bit. + * correct C-bit. Note, this must be done before dropping kvm->lock, + * as region and its array of pages can be freed by a different task + * once kvm->lock is released. */ sev_clflush_pages(region->pages, region->npages); + region->uaddr = range->addr; + region->size = range->size; + + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + return ret; e_free: -- cgit From d02c357e5bfa7dfd618b7b3015624beb71f58f1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 21 Feb 2024 17:26:40 -0800 Subject: KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing Retry page faults without acquiring mmu_lock, and without even faulting the page into the primary MMU, if the resolved gfn is covered by an active invalidation. Contending for mmu_lock is especially problematic on preemptible kernels as the mmu_notifier invalidation task will yield mmu_lock (see rwlock_needbreak()), delay the in-progress invalidation, and ultimately increase the latency of resolving the page fault. And in the worst case scenario, yielding will be accompanied by a remote TLB flush, e.g. if the invalidation covers a large range of memory and vCPUs are accessing addresses that were already zapped. Faulting the page into the primary MMU is similarly problematic, as doing so may acquire locks that need to be taken for the invalidation to complete (the primary MMU has finer grained locks than KVM's MMU), and/or may cause unnecessary churn (getting/putting pages, marking them accessed, etc). Alternatively, the yielding issue could be mitigated by teaching KVM's MMU iterators to perform more work before yielding, but that wouldn't solve the lock contention and would negatively affect scenarios where a vCPU is trying to fault in an address that is NOT covered by the in-progress invalidation. Add a dedicated lockess version of the range-based retry check to avoid false positives on the sanity check on start+end WARN, and so that it's super obvious that checking for a racing invalidation without holding mmu_lock is unsafe (though obviously useful). Wrap mmu_invalidate_in_progress in READ_ONCE() to ensure that pre-checking invalidation in a loop won't put KVM into an infinite loop, e.g. due to caching the in-progress flag and never seeing it go to '0'. Force a load of mmu_invalidate_seq as well, even though it isn't strictly necessary to avoid an infinite loop, as doing so improves the probability that KVM will detect an invalidation that already completed before acquiring mmu_lock and bailing anyways. Do the pre-check even for non-preemptible kernels, as waiting to detect the invalidation until mmu_lock is held guarantees the vCPU will observe the worst case latency in terms of handling the fault, and can generate even more mmu_lock contention. E.g. the vCPU will acquire mmu_lock, detect retry, drop mmu_lock, re-enter the guest, retake the fault, and eventually re-acquire mmu_lock. This behavior is also why there are no new starvation issues due to losing the fairness guarantees provided by rwlocks: if the vCPU needs to retry, it _must_ drop mmu_lock, i.e. waiting on mmu_lock doesn't guarantee forward progress in the face of _another_ mmu_notifier invalidation event. Note, adding READ_ONCE() isn't entirely free, e.g. on x86, the READ_ONCE() may generate a load into a register instead of doing a direct comparison (MOV+TEST+Jcc instead of CMP+Jcc), but practically speaking the added cost is a few bytes of code and maaaaybe a cycle or three. Reported-by: Yan Zhao Closes: https://lore.kernel.org/all/ZNnPF4W26ZbAyGto@yzhao56-desk.sh.intel.com Reported-by: Friedrich Weber Cc: Kai Huang Cc: Yan Zhao Cc: Yuan Yao Cc: Xu Yilun Acked-by: Kai Huang Reviewed-by: Yan Zhao Link: https://lore.kernel.org/r/20240222012640.2820927-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d6cdeab1f8a..0544700ca50b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); + /* + * Check for a relevant mmu_notifier invalidation event before getting + * the pfn from the primary MMU, and before acquiring mmu_lock. + * + * For mmu_lock, if there is an in-progress invalidation and the kernel + * allows preemption, the invalidation task may drop mmu_lock and yield + * in response to mmu_lock being contended, which is *very* counter- + * productive as this vCPU can't actually make forward progress until + * the invalidation completes. + * + * Retrying now can also avoid unnessary lock contention in the primary + * MMU, as the primary MMU doesn't necessarily hold a single lock for + * the duration of the invalidation, i.e. faulting in a conflicting pfn + * can cause the invalidation to take longer by holding locks that are + * needed to complete the invalidation. + * + * Do the pre-check even for non-preemtible kernels, i.e. even if KVM + * will never yield mmu_lock in response to contention, as this vCPU is + * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held + * to detect retry guarantees the worst case latency for the vCPU. + */ + if (fault->slot && + mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + return RET_PF_RETRY; + ret = __kvm_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(!fault->slot)) return kvm_handle_noslot_fault(vcpu, fault, access); + /* + * Check again for a relevant mmu_notifier invalidation event purely to + * avoid contending mmu_lock. Most invalidations will be detected by + * the previous check, but checking is extremely cheap relative to the + * overall cost of failing to detect the invalidation until after + * mmu_lock is acquired. + */ + if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { + kvm_release_pfn_clean(fault->pfn); + return RET_PF_RETRY; + } + return RET_PF_CONTINUE; } @@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; + /* + * Check for a relevant mmu_notifier invalidation event one last time + * now that mmu_lock is held, as the "unsafe" checks performed without + * holding mmu_lock can get false negatives. + */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } -- cgit From 5abf6dceb066f2b02b225fd561440c98a8062681 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 9 Mar 2024 11:24:58 -0500 Subject: SEV: disable SEV-ES DebugSwap by default The DebugSwap feature of SEV-ES provides a way for confidential guests to use data breakpoints. However, because the status of the DebugSwap feature is recorded in the VMSA, enabling it by default invalidates the attestation signatures. In 6.10 we will introduce a new API to create SEV VMs that will allow enabling DebugSwap based on what the user tells KVM to do. Contextually, we will change the legacy KVM_SEV_ES_INIT API to never enable DebugSwap. For compatibility with kernels that pre-date the introduction of DebugSwap, as well as with those where KVM_SEV_ES_INIT will never enable it, do not enable the feature by default. If anybody wants to use it, for now they can enable the sev_es_debug_swap_enabled module parameter, but this will result in a warning. Fixes: d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a132547fcfb5..a8ce5226b3b5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -57,7 +57,7 @@ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); /* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = true; +static bool sev_es_debug_swap_enabled = false; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #else #define sev_enabled false @@ -612,8 +612,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - if (sev_es_debug_swap_enabled) + if (sev_es_debug_swap_enabled) { save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. " + "This will not work starting with Linux 6.10\n"); + } pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); -- cgit