author     Linus Torvalds <torvalds@linux-foundation.org>  2024-03-10 09:27:39 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-03-10 09:27:39 -0700
commit     137e0ec05aeb657472d2f84dbb3081016160334b (patch)
tree       14968f4e17b9c8c961b78fdb51c78266cc5fa772 /arch
parent     005f6f34bd47eaa61d939a2727fc648e687b84c1 (diff)
parent     5abf6dceb066f2b02b225fd561440c98a8062681 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "KVM GUEST_MEMFD fixes for 6.8:

   - Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
     to avoid creating an inconsistent ABI (KVM_MEM_GUEST_MEMFD is not
     writable from userspace, so there would be no way to write to a
     read-only guest_memfd).

   - Update documentation for KVM_SW_PROTECTED_VM to make it abundantly
     clear that such VMs are purely for development and testing.

   - Limit KVM_SW_PROTECTED_VM guests to the TDP MMU, as the long term
     plan is to support confidential VMs with deterministic private
     memory (SNP and TDX) only in the TDP MMU.

   - Fix a bug in a GUEST_MEMFD dirty logging test that caused false
     passes.

  x86 fixes:

   - Fix missing marking of a guest page as dirty when emulating an
     atomic access.

   - Check for mmu_notifier invalidation events before faulting in the
     pfn, and before acquiring mmu_lock, to avoid unnecessary work and
     lock contention with preemptible kernels (including
     CONFIG_PREEMPT_DYNAMIC in non-preemptible mode).

   - Disable AMD DebugSwap by default, it breaks VMSA signing and will
     be re-enabled with a better VM creation API in 6.10.

   - Do the cache flush of converted pages in svm_register_enc_region()
     before dropping kvm->lock, to avoid a race with unregistering of
     the same region and the consequent use-after-free issue"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  SEV: disable SEV-ES DebugSwap by default
  KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing
  KVM: SVM: Flush pages under kvm->lock to fix UAF in svm_register_enc_region()
  KVM: selftests: Add a testcase to verify GUEST_MEMFD and READONLY are exclusive
  KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases
  KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU
  KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP
  KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
  KVM: x86: Mark target gfn of emulated atomic instruction as dirty
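As a concrete illustration of the first point above, here is a hedged userspace sketch of binding a guest_memfd-backed slot. It is not code from this merge: the helper name bind_guest_memfd_slot() and the error handling are invented, and it assumes the 6.8 UAPI definitions of KVM_CREATE_GUEST_MEMFD and KVM_SET_USER_MEMORY_REGION2 from <linux/kvm.h>. After this merge, adding KVM_MEM_READONLY to the flags of such a slot is expected to be rejected when the region is set.

/*
 * Hypothetical userspace sketch (not from this merge): bind a guest_memfd
 * slot to a VM.  vm_fd is assumed to come from a prior KVM_CREATE_VM.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int bind_guest_memfd_slot(int vm_fd, void *shared_hva,
                                 __u64 gpa, __u64 size)
{
        struct kvm_create_guest_memfd gmem = { .size = size };
        int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

        if (gmem_fd < 0)
                return -1;

        struct kvm_userspace_memory_region2 region = {
                .slot               = 0,
                /*
                 * KVM_MEM_GUEST_MEMFD | KVM_MEM_READONLY is the combination
                 * this merge forbids: userspace cannot write to a guest_memfd
                 * slot anyway, so a read-only variant would be meaningless.
                 */
                .flags              = KVM_MEM_GUEST_MEMFD,
                .guest_phys_addr    = gpa,
                .memory_size        = size,
                .userspace_addr     = (__u64)(unsigned long)shared_hva,
                .guest_memfd        = gmem_fd,
                .guest_memfd_offset = 0,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}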
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/kvm/Kconfig     7
-rw-r--r--  arch/x86/kvm/mmu/mmu.c  42
-rw-r--r--  arch/x86/kvm/svm/sev.c  23
-rw-r--r--  arch/x86/kvm/x86.c      12
4 files changed, 71 insertions, 13 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 87e3da7b0439..65ed14b6540b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM
depends on KVM && X86_64
select KVM_GENERIC_PRIVATE_MEM
help
- Enable support for KVM software-protected VMs. Currently "protected"
- means the VM can be backed with memory provided by
- KVM_CREATE_GUEST_MEMFD.
+ Enable support for KVM software-protected VMs. Currently, software-
+ protected VMs are purely a development and testing vehicle for
+ KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+ software-protected VM will fail miserably.
If unsure, say "N".
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2d6cdeab1f8a..0544700ca50b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
+ /*
+ * Check for a relevant mmu_notifier invalidation event before getting
+ * the pfn from the primary MMU, and before acquiring mmu_lock.
+ *
+ * For mmu_lock, if there is an in-progress invalidation and the kernel
+ * allows preemption, the invalidation task may drop mmu_lock and yield
+ * in response to mmu_lock being contended, which is *very* counter-
+ * productive as this vCPU can't actually make forward progress until
+ * the invalidation completes.
+ *
+ * Retrying now can also avoid unnecessary lock contention in the primary
+ * MMU, as the primary MMU doesn't necessarily hold a single lock for
+ * the duration of the invalidation, i.e. faulting in a conflicting pfn
+ * can cause the invalidation to take longer by holding locks that are
+ * needed to complete the invalidation.
+ *
+ * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+ * will never yield mmu_lock in response to contention, as this vCPU is
+ * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+ * to detect retry guarantees the worst case latency for the vCPU.
+ */
+ if (fault->slot &&
+ mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ return RET_PF_RETRY;
+
ret = __kvm_faultin_pfn(vcpu, fault);
if (ret != RET_PF_CONTINUE)
return ret;
@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(!fault->slot))
return kvm_handle_noslot_fault(vcpu, fault, access);
+ /*
+ * Check again for a relevant mmu_notifier invalidation event purely to
+ * avoid contending mmu_lock. Most invalidations will be detected by
+ * the previous check, but checking is extremely cheap relative to the
+ * overall cost of failing to detect the invalidation until after
+ * mmu_lock is acquired.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+ kvm_release_pfn_clean(fault->pfn);
+ return RET_PF_RETRY;
+ }
+
return RET_PF_CONTINUE;
}
@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
return true;
+ /*
+ * Check for a relevant mmu_notifier invalidation event one last time
+ * now that mmu_lock is held, as the "unsafe" checks performed without
+ * holding mmu_lock can get false negatives.
+ */
return fault->slot &&
mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
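For readers unfamiliar with the pattern the new comments describe, the sketch below distills it: snapshot the invalidation sequence count, do the expensive pfn lookup without mmu_lock, and recheck both before taking the lock (racy but cheap) and again under it (authoritative, not shown). This is a simplified illustration with invented names (mmu_state, invalidate_retry_unsafe, fault_in), not KVM's implementation.

/* Simplified illustration only; names are invented and this is not KVM code. */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct mmu_state {
        atomic_ulong invalidate_seq;            /* bumped when an invalidation finishes */
        atomic_ulong invalidate_in_progress;    /* count of in-flight invalidations */
        uint64_t range_start, range_end;        /* gfn range being invalidated */
};

/*
 * Racy pre-check: it can miss an invalidation (false negative), which is fine
 * because the same condition is re-evaluated under the lock before the fault
 * result is committed.  It never blocks, so it is cheap to call repeatedly.
 */
static bool invalidate_retry_unsafe(struct mmu_state *m, unsigned long seq,
                                    uint64_t gfn)
{
        if (atomic_load(&m->invalidate_in_progress) &&
            gfn >= m->range_start && gfn < m->range_end)
                return true;

        return atomic_load(&m->invalidate_seq) != seq;
}

static int fault_in(struct mmu_state *m, uint64_t gfn)
{
        unsigned long seq = atomic_load(&m->invalidate_seq);    /* snapshot */

        /* Check #1: bail before the expensive pfn lookup even starts. */
        if (invalidate_retry_unsafe(m, seq, gfn))
                return -EAGAIN;                 /* i.e. RET_PF_RETRY */

        /* ... expensive pfn lookup goes here ... */

        /* Check #2: bail before contending the lock the invalidator holds. */
        if (invalidate_retry_unsafe(m, seq, gfn))
                return -EAGAIN;

        /* Check #3 (authoritative) runs once more with the lock held. */
        return 0;
}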
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f760106c31f8..a8ce5226b3b5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -57,7 +57,7 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
+static bool sev_es_debug_swap_enabled = false;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
#else
#define sev_enabled false
@@ -612,8 +612,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled)
+ if (sev_es_debug_swap_enabled) {
save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+ pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
+ "This will not work starting with Linux 6.10\n");
+ }
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -1975,20 +1978,22 @@ int sev_mem_enc_register_region(struct kvm *kvm,
goto e_free;
}
- region->uaddr = range->addr;
- region->size = range->size;
-
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
* flushed to ensure that guest data gets written into memory with
- * correct C-bit.
+ * correct C-bit. Note, this must be done before dropping kvm->lock,
+ * as region and its array of pages can be freed by a different task
+ * once kvm->lock is released.
*/
sev_clflush_pages(region->pages, region->npages);
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
return ret;
e_free:
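The reordering in the last hunk follows a general rule: every dereference of an object whose lifetime is guaranteed only by membership in a lock-protected list must happen before that lock is dropped. Below is a generic sketch of the bug and the fix, with invented names (obj, flush_pages, register_obj_*); it is an illustration of the pattern, not the SEV code itself.

/*
 * Generic illustration of the use-after-free fixed above; all names are
 * invented.  'obj' stays alive only while it is on the lock-protected list.
 */
#include <pthread.h>

struct obj {
        struct obj *next;
        void *pages;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *obj_list;

static void flush_pages(void *pages)
{
        (void)pages;    /* stand-in for the expensive cache flush */
}

static void register_obj_buggy(struct obj *obj)
{
        pthread_mutex_lock(&list_lock);
        obj->next = obj_list;
        obj_list = obj;
        pthread_mutex_unlock(&list_lock);

        /*
         * BUG: once the lock is dropped, another thread can unregister and
         * free 'obj', so this dereference can be a use-after-free.
         */
        flush_pages(obj->pages);
}

static void register_obj_fixed(struct obj *obj)
{
        pthread_mutex_lock(&list_lock);

        /* Touch 'obj' while the lock still guarantees it cannot be freed. */
        flush_pages(obj->pages);

        obj->next = obj_list;
        obj_list = obj;
        pthread_mutex_unlock(&list_lock);
}

Separately, note that the DebugSwap hunks above only flip the default: loading kvm_amd with its debug_swap=1 module parameter still opts in, which is what triggers the new pr_warn_once().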
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48a61d283406..e02cc710f56d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4580,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
{
return type == KVM_X86_DEFAULT_VM ||
(type == KVM_X86_SW_PROTECTED_VM &&
- IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+ IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
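The one-word change above (tdp_enabled to tdp_mmu_enabled) means KVM_X86_SW_PROTECTED_VM now also requires the TDP MMU, not merely two-dimensional paging. The probe below is a hedged userspace sketch, not code from this merge: the helper name is invented and error handling is elided, and it assumes KVM_X86_SW_PROTECTED_VM from the x86 UAPI headers. On a kernel where the TDP MMU is disabled (for example, booted with kvm.tdp_mmu=0), the KVM_CREATE_VM call is expected to fail.

/* Hypothetical probe for software-protected VM support; not from this merge. */
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int create_sw_protected_vm(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        int vm_fd;

        if (kvm_fd < 0)
                return -1;

        /*
         * The KVM_CREATE_VM argument is the VM type.  After this merge the
         * call succeeds only with CONFIG_KVM_SW_PROTECTED_VM=y *and* the
         * TDP MMU in use.
         */
        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_SW_PROTECTED_VM);

        close(kvm_fd);
        return vm_fd;   /* negative if the VM type is unsupported */
}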
@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (r < 0)
return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+ * successful, as the old value is written back on failure. Note, for
+ * live migration, this is unnecessarily conservative as CMPXCHG writes
+ * back the original value and the access is atomic, but KVM's ABI is
+ * that all writes are dirty logged, regardless of the value written.
+ */
+ kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
if (r)
return X86EMUL_CMPXCHG_FAILED;