summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r--arch/x86/kvm/mmu/mmu.c1678
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h143
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h3
-rw-r--r--arch/x86/kvm/mmu/page_track.c70
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h123
-rw-r--r--arch/x86/kvm/mmu/spte.c188
-rw-r--r--arch/x86/kvm/mmu/spte.h139
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h23
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c930
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h60
11 files changed, 1984 insertions, 1383 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0544700ca50b..8160870398b9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -47,18 +47,18 @@
#include <linux/kern_levels.h>
#include <linux/kstrtox.h>
#include <linux/kthread.h>
+#include <linux/wordpart.h>
#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
+#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
#include "trace.h"
-extern bool itlb_multihit_kvm_mitigation;
-
static bool nx_hugepage_mitigation_hard_disabled;
int __read_mostly nx_huge_pages = -1;
@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
@@ -263,7 +262,7 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu)
{
- if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
return kvm_read_cr3(vcpu);
return mmu->get_guest_pgd(vcpu);
@@ -336,16 +335,19 @@ static int is_cpuid_PSE36(void)
#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
WRITE_ONCE(*sptep, spte);
}
static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
WRITE_ONCE(*sptep, spte);
}
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
return xchg(sptep, spte);
}
@@ -432,8 +434,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte (mm/gup.c).
*
- * An spte tlb flush may be pending, because kvm_set_pte_rmap
- * coalesces them and we are running out of the MMU lock. Therefore
+ * An spte tlb flush may be pending, because they are coalesced and
+ * we are running out of the MMU lock. Therefore
* we need to protect against in-progress updates of the spte.
*
* Reading the spte while an update is in progress may get the old value
@@ -482,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}
-/*
- * Update the SPTE (excluding the PFN), but do not track changes in its
- * accessed/dirty status.
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
*/
-static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
@@ -495,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
- return old_spte;
+ return false;
}
if (!spte_has_volatile_bits(old_spte))
@@ -503,53 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
- WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+ WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
- return old_spte;
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
- *
- * Returns true if the TLB needs to be flushed
- */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
-{
- bool flush = false;
- u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
-
- if (!is_shadow_present_pte(old_spte))
- return false;
-
- /*
- * For the spte updated out of mmu-lock is safe, since
- * we always atomically update it, see the comments in
- * spte_has_volatile_bits().
- */
- if (is_mmu_writable_spte(old_spte) &&
- !is_writable_pte(new_spte))
- flush = true;
-
- /*
- * Flush TLB when accessed/dirty states are changed in the page tables,
- * to guarantee consistency between TLB and page tables.
- */
-
- if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
- flush = true;
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- }
-
- if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
- flush = true;
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
- }
-
- return flush;
+ return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
}
/*
@@ -560,39 +520,19 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
- kvm_pfn_t pfn;
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
- struct page *page;
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
if (!is_shadow_present_pte(old_spte))
return old_spte;
kvm_update_page_stats(kvm, level, -1);
-
- pfn = spte_to_pfn(old_spte);
-
- /*
- * KVM doesn't hold a reference to any pages mapped into the guest, and
- * instead uses the mmu_notifier to ensure that KVM unmaps any pages
- * before they are reclaimed. Sanity check that, if the pfn is backed
- * by a refcounted page, the refcount is elevated.
- */
- page = kvm_pfn_to_refcounted_page(pfn);
- WARN_ON_ONCE(page && !page_count(page));
-
- if (is_accessed_spte(old_spte))
- kvm_set_pfn_accessed(pfn);
-
- if (is_dirty_spte(old_spte))
- kvm_set_pfn_dirty(pfn);
-
return old_spte;
}
@@ -603,7 +543,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -611,32 +551,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Returns the Accessed status of the PTE and resets it at the same time. */
-static bool mmu_spte_age(u64 *sptep)
-{
- u64 spte = mmu_spte_get_lockless(sptep);
-
- if (!is_accessed_spte(spte))
- return false;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
-
- spte = mark_spte_for_access_track(spte);
- mmu_spte_update_no_track(sptep, spte);
- }
-
- return true;
-}
-
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@ -685,6 +599,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
if (r)
@@ -704,6 +624,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
@@ -719,7 +640,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
if (sp->role.passthrough)
return sp->gfn;
- if (!sp->role.direct)
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] >> PAGE_SHIFT;
return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@ -733,7 +654,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
*/
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
- if (sp_has_gptes(sp))
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] & ACC_ALL;
/*
@@ -754,7 +675,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
gfn_t gfn, unsigned int access)
{
- if (sp_has_gptes(sp)) {
+ if (sp->shadowed_translation) {
sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
return;
}
@@ -831,6 +752,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
gfn_t gfn;
kvm->arch.indirect_shadow_pages++;
+ /*
+ * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+ * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+ * emulated writes are visible before re-reading guest PTEs, or that
+ * an emulated write will see the elevated count and acquire mmu_lock
+ * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
+ */
+ smp_mb();
+
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
@@ -926,6 +856,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
*/
+#define KVM_RMAP_MANY BIT(0)
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
@@ -938,16 +869,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & 1)) {
+ } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -956,10 +887,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
}
desc->sptes[desc->spte_count++] = spte;
}
@@ -970,7 +901,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -999,7 +930,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
if (!head_desc->more)
rmap_head->val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | 1;
+ rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -1012,13 +943,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
rmap_head->val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
@@ -1051,12 +982,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
if (!rmap_head->val)
return false;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -1076,10 +1007,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
if (!rmap_head->val)
return 0;
- else if (!(rmap_head->val & 1))
+ else if (!(rmap_head->val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1141,13 +1072,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
if (!rmap_head->val)
return NULL;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
@@ -1263,16 +1194,6 @@ static bool spte_clear_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool spte_wrprot_for_clear_dirty(u64 *sptep)
-{
- bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
- (unsigned long *)sptep);
- if (was_writable && !spte_ad_enabled(*sptep))
- kvm_set_pfn_dirty(spte_to_pfn(*sptep));
-
- return was_writable;
-}
-
/*
* Gets the GFN ready for another round of dirty logging by clearing the
* - D bit on ad-enabled SPTEs, and
@@ -1288,22 +1209,14 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
if (spte_ad_need_write_protect(*sptep))
- flush |= spte_wrprot_for_clear_dirty(sptep);
+ flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
return flush;
}
-/**
- * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
- * @kvm: kvm instance
- * @slot: slot to protect
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should protect
- *
- * Used when we do not need to care about huge page mappings.
- */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1327,16 +1240,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
- * protect the page if the D-bit isn't supported.
- * @kvm: kvm instance
- * @slot: slot to clear D-bit
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should clear D-bit
- *
- * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
- */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1360,24 +1263,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * PT level pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- *
- * We need to care about huge page mappings: e.g. during dirty logging we may
- * have such mappings.
- */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
/*
- * Huge pages are NOT write protected when we start dirty logging in
- * initially-all-set mode; must write protect them here so that they
- * are split to 4K on the first write.
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
*
* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
* of memslot has no such restriction, so the range can cross two large
@@ -1399,7 +1294,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
PG_LEVEL_2M);
}
- /* Now handle 4K PTEs. */
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled but it chooses between clearing the Dirty bit and Writeable
+ * bit based on the context.
+ */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1441,54 +1345,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
-static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot)
-{
- return kvm_zap_all_rmap_sptes(kvm, rmap_head);
-}
-
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
-{
- return __kvm_zap_rmap(kvm, rmap_head, slot);
-}
-
-static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t pte)
+ const struct kvm_memory_slot *slot)
{
- u64 *sptep;
- struct rmap_iterator iter;
- bool need_flush = false;
- u64 new_spte;
- kvm_pfn_t new_pfn;
-
- WARN_ON_ONCE(pte_huge(pte));
- new_pfn = pte_pfn(pte);
-
-restart:
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- need_flush = true;
-
- if (pte_write(pte)) {
- kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- goto restart;
- } else {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- *sptep, new_pfn);
-
- mmu_spte_clear_track_bits(kvm, sptep);
- mmu_spte_set(sptep, new_spte);
- }
- }
-
- if (need_flush && kvm_available_flush_remote_tlbs_range()) {
- kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
- return false;
- }
-
- return need_flush;
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
struct slot_rmap_walk_iterator {
@@ -1539,7 +1399,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
while (++iterator->rmap <= iterator->end_rmap) {
- iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
if (iterator->rmap->val)
return;
@@ -1560,80 +1420,101 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t pte);
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
-static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range,
- rmap_handler_t handler)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool ret = false;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator)
- ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->arg.pte);
-
- return ret;
-}
-
-bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- bool flush = false;
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
- if (tdp_mmu_enabled)
- flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ if (!can_yield)
+ continue;
- if (kvm_x86_ops.set_apic_access_page_addr &&
- range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
return flush;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
{
- bool flush = false;
-
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
- if (tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
- return flush;
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
}
-static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- u64 *sptep;
- struct rmap_iterator iter;
- int young = 0;
+ bool flush = false;
- for_each_rmap_spte(rmap_head, &iter, sptep)
- young |= mmu_spte_age(sptep);
+ /*
+ * To prevent races with vCPUs faulting in a gfn using stale data,
+ * zapping a gfn range must be protected by mmu_invalidate_in_progress
+ * (and mmu_invalidate_seq). The only exception is memslot deletion;
+ * in that case, SRCU synchronization ensures that SPTEs are zapped
+ * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the
+ * invalid slot.
+ */
+ lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
+ lockdep_is_held(&kvm->slots_lock));
- return young;
-}
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
-static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t unused)
-{
- u64 *sptep;
- struct rmap_iterator iter;
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
- for_each_rmap_spte(rmap_head, &iter, sptep)
- if (is_accessed_spte(*sptep))
- return true;
- return false;
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+ return flush;
}
#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1670,12 +1551,49 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range, bool test_only)
+{
+ struct slot_rmap_walk_iterator iterator;
+ struct rmap_iterator iter;
+ bool young = false;
+ u64 *sptep;
+
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator) {
+ for_each_rmap_spte(iterator.rmap, &iter, sptep) {
+ u64 spte = *sptep;
+
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * WARN if mmu_spte_update() signals the need
+ * for a TLB flush, as Access tracking a SPTE
+ * should never trigger an _immediate_ flush.
+ */
+ spte = mark_spte_for_access_track(spte);
+ WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+ }
+ young = true;
+ }
+ }
+ return young;
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, false);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1688,7 +1606,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, true);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -1710,27 +1628,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
#endif
}
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values. We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
- kvm->arch.n_used_mmu_pages += nr;
- percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- kvm_mod_used_mmu_pages(kvm, +1);
+ kvm->arch.n_used_mmu_pages++;
kvm_account_pgtable_pages((void *)sp->spt, +1);
}
static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- kvm_mod_used_mmu_pages(kvm, -1);
+ kvm->arch.n_used_mmu_pages--;
kvm_account_pgtable_pages((void *)sp->spt, -1);
}
@@ -1741,8 +1647,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
- if (!sp->role.direct)
- free_page((unsigned long)sp->shadowed_translation);
+ free_page((unsigned long)sp->shadowed_translation);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1950,7 +1855,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- if (!sp->spt[i])
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
return 0;
return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@ -2243,7 +2149,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
- if (!role.direct)
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -2514,7 +2420,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
- } else if (is_mmio_spte(pte)) {
+ } else if (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
@@ -2754,36 +2660,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
write_unlock(&kvm->mmu_lock);
}
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
{
- struct kvm_mmu_page *sp;
+ struct kvm *kvm = vcpu->kvm;
LIST_HEAD(invalid_list);
- int r;
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+ * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked
+ * by a third party. Reading indirect_shadow_pages without holding
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
- r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- r = 1;
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
write_unlock(&kvm->mmu_lock);
- return r;
-}
-
-static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu->root_role.direct)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
return r;
}
@@ -2803,7 +2722,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
* be write-protected.
*/
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
- gfn_t gfn, bool can_unsync, bool prefetch)
+ gfn_t gfn, bool synchronizing, bool prefetch)
{
struct kvm_mmu_page *sp;
bool locked = false;
@@ -2818,12 +2737,12 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
/*
* The page is not write-tracked, mark existing shadow pages unsync
- * unless KVM is synchronizing an unsync SP (can_unsync = false). In
- * that case, KVM must complete emulation of the guest TLB flush before
- * allowing shadow pages to become unsync (writable by the guest).
+ * unless KVM is synchronizing an unsync SP. In that case, KVM must
+ * complete emulation of the guest TLB flush before allowing shadow
+ * pages to become unsync (writable by the guest).
*/
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- if (!can_unsync)
+ if (synchronizing)
return -EPERM;
if (sp->unsync)
@@ -2927,6 +2846,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
+ if (prefetch)
+ return RET_PF_SPURIOUS;
+
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
@@ -2946,7 +2868,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
- true, host_writable, &spte);
+ false, host_writable, &spte);
if (*sptep == spte) {
ret = RET_PF_SPURIOUS;
@@ -2955,10 +2877,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
trace_kvm_mmu_set_spte(level, gfn, sptep);
}
- if (wrprot) {
- if (write_fault)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@ -2974,32 +2894,51 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
return ret;
}
-static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
+static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep,
+ int nr_pages, unsigned int access)
{
struct page *pages[PTE_PREFETCH_NUM];
struct kvm_memory_slot *slot;
- unsigned int access = sp->role.access;
- int i, ret;
- gfn_t gfn;
+ int i;
+
+ if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM))
+ return false;
- gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
if (!slot)
- return -1;
+ return false;
- ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
- if (ret <= 0)
- return -1;
+ nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages);
+ if (nr_pages <= 0)
+ return false;
- for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, slot, start, access, gfn,
+ for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
+ mmu_set_spte(vcpu, slot, sptep, access, gfn,
page_to_pfn(pages[i]), NULL);
- put_page(pages[i]);
+
+ /*
+ * KVM always prefetches writable pages from the primary MMU,
+ * and KVM can make its SPTE writable in the fast page handler,
+ * without notifying the primary MMU. Mark pages/folios dirty
+ * now to ensure file data is written back if it ends up being
+ * written by the guest. Because KVM's prefetching GUPs
+ * writable PTEs, the probability of unnecessary writeback is
+ * extremely low.
+ */
+ kvm_release_page_dirty(pages[i]);
}
- return 0;
+ return true;
+}
+
+static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
+ unsigned int access = sp->role.access;
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access);
}
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
@@ -3017,8 +2956,9 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ if (!direct_pte_prefetch_many(vcpu, sp, start, spte))
return;
+
start = NULL;
} else if (!start)
start = spte;
@@ -3110,7 +3050,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
/*
* Read each entry once. As above, a non-leaf entry can be promoted to
* a huge page _during_ this walk. Re-reading the entry could send the
- * walk into the weeks, e.g. p*d_large() returns false (sees the old
+ * walk into the weeks, e.g. p*d_leaf() returns false (sees the old
* value) and then p*d_offset() walks into the target huge page instead
* of the old page table (sees the new value).
*/
@@ -3126,7 +3066,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
if (pud_none(pud) || !pud_present(pud))
goto out;
- if (pud_large(pud)) {
+ if (pud_leaf(pud)) {
level = PG_LEVEL_1G;
goto out;
}
@@ -3135,7 +3075,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
if (pmd_none(pmd) || !pmd_present(pmd))
goto out;
- if (pmd_large(pmd))
+ if (pmd_leaf(pmd))
level = PG_LEVEL_2M;
out:
@@ -3168,13 +3108,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level)
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);
- return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+ return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -3314,9 +3253,18 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
{
gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
@@ -3338,7 +3286,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
{
/*
* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@ -3350,6 +3298,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
return false;
/*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
* #PF can be fast if:
*
* 1. The shadow page table entry is not present and A/D bits are
@@ -3365,7 +3333,7 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
* by setting the Writable bit, which can be done out of mmu_lock.
*/
if (!fault->present)
- return !kvm_ad_enabled();
+ return !kvm_ad_enabled;
/*
* Note, instruction fetches and writes are mutually exclusive, ignore
@@ -3392,7 +3360,7 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
* harm. This also avoids the TLB flush needed after setting dirty bit
* so non-PML cases won't be impacted.
*
- * Compare with set_spte where instead shadow_dirty_mask is set.
+ * Compare with make_spte() where instead shadow_dirty_mask is set.
*/
if (!try_cmpxchg64(sptep, &old_spte, new_spte))
return false;
@@ -3403,18 +3371,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
return true;
}
-static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
-{
- if (fault->exec)
- return is_executable_pte(spte);
-
- if (fault->write)
- return is_writable_pte(spte);
-
- /* Fault was on Read access */
- return spte & PT_PRESENT_MASK;
-}
-
/*
* Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
@@ -3449,7 +3405,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 *sptep;
uint retry_count = 0;
- if (!page_fault_can_be_fast(fault))
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3458,7 +3414,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 new_spte;
if (tdp_mmu_enabled)
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3468,7 +3424,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* available as the vCPU holds a reference to its root(s).
*/
if (WARN_ON_ONCE(!sptep))
- spte = REMOVED_SPTE;
+ spte = FROZEN_SPTE;
if (!is_shadow_present_pte(spte))
break;
@@ -3500,8 +3456,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* uses A/D bits for non-nested MMUs. Thus, if A/D bits are
* enabled, the SPTE can't be an access-tracked SPTE.
*/
- if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
- new_spte = restore_acc_track_spte(new_spte);
+ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte) |
+ shadow_accessed_mask;
/*
* To keep things simple, only SPTEs that are MMU-writable can
@@ -3575,10 +3532,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (WARN_ON_ONCE(!sp))
return;
- if (is_tdp_mmu_page(sp))
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
kvm_tdp_mmu_put_root(kvm, sp);
- else if (!--sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
*root_hpa = INVALID_PAGE;
}
@@ -3587,6 +3548,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
int i;
LIST_HEAD(invalid_list);
bool free_active_root;
@@ -3609,7 +3571,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
return;
}
- write_lock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3600,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
mmu->root.pgd = 0;
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3693,15 +3663,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
goto out_unlock;
- if (tdp_mmu_enabled) {
- root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root.hpa = root;
- } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -4121,23 +4096,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
return leaf;
}
-/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
-static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
- u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
- struct rsvd_bits_validate *rsvd_check;
- int root, leaf, level;
- bool reserved = false;
+ int leaf;
walk_shadow_page_lockless_begin(vcpu);
if (is_tdp_mmu_active(vcpu))
- leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
else
- leaf = get_walk(vcpu, addr, sptes, &root);
+ leaf = get_walk(vcpu, addr, sptes, root_level);
walk_shadow_page_lockless_end(vcpu);
+ return leaf;
+}
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ struct rsvd_bits_validate *rsvd_check;
+ int root, leaf, level;
+ bool reserved = false;
+
+ leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
if (unlikely(leaf < 0)) {
*sptep = 0ull;
return reserved;
@@ -4183,7 +4166,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (WARN_ON_ONCE(reserved))
return -EINVAL;
- if (is_mmio_spte(spte)) {
+ if (is_mmio_spte(vcpu->kvm, spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned int access = get_mmio_spte_access(spte);
@@ -4246,24 +4229,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
}
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
struct kvm_arch_async_pf arch;
arch.token = alloc_apf_token(vcpu);
- arch.gfn = gfn;
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
- return kvm_setup_async_pf(vcpu, cr2_or_gpa,
- kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -4276,7 +4263,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+ * ignore stats for all other return times. Page-ready "faults" aren't
+ * truly spurious and never trigger emulation
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4296,16 +4292,34 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ u8 max_level, int gmem_order)
+{
+ u8 req_max_level;
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
+ if (req_max_level)
+ max_level = min(max_level, req_max_level);
+
+ return max_level;
+}
+
+static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int r)
{
- kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
+ kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
+ r == RET_PF_RETRY, fault->map_writable);
}
-static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
int max_order, r;
@@ -4315,65 +4329,39 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
- &max_order);
+ &fault->refcounted_page, &max_order);
if (r) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return r;
}
- fault->max_level = min(kvm_max_level_for_order(max_order),
- fault->max_level);
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
+ fault->max_level, max_order);
return RET_PF_CONTINUE;
}
-static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot = fault->slot;
- bool async;
-
- /*
- * Retry the page fault if the gfn hit a memslot that is being deleted
- * or moved. This ensures any existing SPTEs for the old memslot will
- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
- */
- if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return RET_PF_RETRY;
-
- if (!kvm_is_visible_memslot(slot)) {
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu)) {
- fault->slot = NULL;
- fault->pfn = KVM_PFN_NOSLOT;
- fault->map_writable = false;
- return RET_PF_CONTINUE;
- }
- /*
- * If the APIC access page exists but is disabled, go directly
- * to emulation without caching the MMIO access or creating a
- * MMIO SPTE. That way the cache doesn't need to be purged
- * when the AVIC is re-enabled.
- */
- if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
- !kvm_apicv_activated(vcpu->kvm))
- return RET_PF_EMULATE;
- }
-
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
- return -EFAULT;
- }
+ unsigned int foll = fault->write ? FOLL_WRITE : 0;
if (fault->is_private)
- return kvm_faultin_pfn_private(vcpu, fault);
+ return kvm_mmu_faultin_pfn_private(vcpu, fault);
- async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
- fault->write, &fault->map_writable,
- &fault->hva);
- if (!async)
- return RET_PF_CONTINUE; /* *pfn has correct page already */
+ foll |= FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ /*
+ * If resolving the page failed because I/O is needed to fault-in the
+ * page, then either set up an asynchronous #PF to do the I/O, or if
+ * doing an async #PF isn't possible, retry with I/O allowed. All
+ * other failures are terminal, i.e. retrying won't help.
+ */
+ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
+ return RET_PF_CONTINUE;
if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(fault->addr, fault->gfn);
@@ -4381,7 +4369,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return RET_PF_RETRY;
- } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
return RET_PF_RETRY;
}
}
@@ -4391,21 +4379,79 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* to wait for IO. Note, gup always bails if it is unable to quickly
* get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
- fault->write, &fault->map_writable,
- &fault->hva);
+ foll |= FOLL_INTERRUPTIBLE;
+ foll &= ~FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
return RET_PF_CONTINUE;
}
-static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- unsigned int access)
+static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, unsigned int access)
{
+ struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
int ret;
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+ * invalidation relate to fault->fn and resume the guest without
+ * installing a mapping in the page tables.
+ */
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
/*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID)
+ return RET_PF_RETRY;
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
+ /*
* Check for a relevant mmu_notifier invalidation event before getting
* the pfn from the primary MMU, and before acquiring mmu_lock.
*
@@ -4426,18 +4472,17 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (fault->slot &&
- mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
- ret = __kvm_faultin_pfn(vcpu, fault);
+ ret = __kvm_mmu_faultin_pfn(vcpu, fault);
if (ret != RET_PF_CONTINUE)
return ret;
if (unlikely(is_error_pfn(fault->pfn)))
return kvm_handle_error_pfn(vcpu, fault);
- if (unlikely(!fault->slot))
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
return kvm_handle_noslot_fault(vcpu, fault, access);
/*
@@ -4447,8 +4492,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* overall cost of failing to detect the invalidation until after
* mmu_lock is acquired.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
- kvm_release_pfn_clean(fault->pfn);
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
+ kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
return RET_PF_RETRY;
}
@@ -4497,7 +4542,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4507,7 +4552,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
@@ -4524,8 +4569,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = direct_map(vcpu, fault);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -4548,13 +4593,24 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(fault_address >> 32))
return -EFAULT;
#endif
+ /*
+ * Legacy #PF exception only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
@@ -4577,7 +4633,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
int r;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4587,7 +4643,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
@@ -4600,44 +4656,27 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
r = kvm_tdp_mmu_map(vcpu, fault);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
read_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
#endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
- * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
- * to honor the memtype from the guest's MTRRs so that guest accesses
- * to memory that is DMA'd aren't cached against the guest's wishes.
- *
- * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
- * e.g. KVM will force UC memtype for host MMIO.
+ * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
+ * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+ * honor the memtype from the guest's PAT so that guest accesses to
+ * memory that is DMA'd aren't cached against the guest's wishes. As a
+ * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+ * KVM _always_ ignores guest PAT (when EPT is enabled).
*/
- return vm_has_noncoherent_dma && shadow_memtype_mask;
+ return shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- /*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping.
- */
- if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
- for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = gfn_round_for_level(fault->gfn,
- fault->max_level);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
- }
- }
-
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
@@ -4646,6 +4685,85 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 end;
+ int r;
+
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ /*
+ * Shadow paging uses GVA for kvm page fault, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+ * If the mapping that covers range->gpa can use a huge page, it
+ * may start below it or end after range->gpa + range->size.
+ */
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -4799,7 +4917,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
- if (unlikely(is_mmio_spte(*sptep))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
mmu_spte_clear_no_track(sptep);
return true;
@@ -4920,9 +5038,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
- guest_cpuid_is_amd_or_hygon(vcpu));
+ guest_cpuid_is_amd_compatible(vcpu));
}
static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
@@ -4973,7 +5091,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
static inline u64 reserved_hpa_bits(void)
{
- return rsvd_bits(shadow_phys_bits, 63);
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
}
/*
@@ -4997,7 +5115,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_pse, is_amd);
if (!shadow_me_mask)
@@ -5309,6 +5427,11 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
return max_tdp_level;
}
+u8 kvm_mmu_get_max_tdp_level(void)
+{
+ return tdp_root_level ? tdp_root_level : max_tdp_level;
+}
+
static union kvm_mmu_page_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
union kvm_cpu_role cpu_role)
@@ -5320,7 +5443,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
- role.ad_disabled = !kvm_ad_enabled();
+ role.ad_disabled = !kvm_ad_enabled;
role.level = kvm_mmu_get_tdp_level(vcpu);
role.direct = true;
role.has_4_byte_gpte = false;
@@ -5417,7 +5540,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
union kvm_mmu_page_role root_role;
/* NPT requires CR0.PG=1. */
- WARN_ON_ONCE(cpu_role.base.direct);
+ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
@@ -5563,9 +5686,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* that problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
- vcpu->arch.root_mmu.root_role.word = 0;
- vcpu->arch.guest_mmu.root_role.word = 0;
- vcpu->arch.nested_mmu.root_role.word = 0;
+ vcpu->arch.root_mmu.root_role.invalid = 1;
+ vcpu->arch.guest_mmu.root_role.invalid = 1;
+ vcpu->arch.nested_mmu.root_role.invalid = 1;
vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
@@ -5613,7 +5736,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
* stale entries. Flushing on alloc also allows KVM to skip the TLB
* flush when freeing a root (see kvm_tdp_mmu_put_root()).
*/
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
out:
return r;
}
@@ -5789,10 +5912,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
bool flush = false;
/*
- * If we don't have indirect shadow pages, it means no page is
- * write-protected, so we can exit simply.
+ * When emulating guest writes, ensure the written value is visible to
+ * any task that is handling page faults before checking whether or not
+ * KVM is shadowing a guest PTE. This ensures either KVM will create
+ * the correct SPTE in the page fault handler, or this task will see
+ * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in
+ * account_shadowed().
*/
- if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ smp_mb();
+ if (!vcpu->kvm->arch.indirect_shadow_pages)
return;
write_lock(&vcpu->kvm->mmu_lock);
@@ -5827,78 +5955,195 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
write_unlock(&vcpu->kvm->mmu_lock);
}
-int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
- void *insn, int insn_len)
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
{
- int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
/*
- * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
- * checks when emulating instructions that triggers implicit access.
- * WARN if hardware generates a fault with an error code that collides
- * with the KVM-defined value. Clear the flag and continue on, i.e.
- * don't terminate the VM, as KVM can't possibly be relying on a flag
- * that KVM doesn't know about.
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+ * so will likely put the vCPU into an infinite. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
*/
- if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
- error_code &= ~PFERR_IMPLICIT_ACCESS;
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+ * The gfn is write-protected, but if KVM detects its emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_PF;
+ bool direct = vcpu->arch.mmu->root_role.direct;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+ * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ * currently supported nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false,
- &emulation_type);
+ vcpu->stat.pf_taken++;
+
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
+ &emulation_type, NULL);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
if (r < 0)
return r;
- if (r != RET_PF_EMULATE)
- return 1;
+
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
/*
- * Before emulating the instruction, check if the error code
- * was due to a RO violation while translating the guest page.
- * This can occur when using nested virtualization with nested
- * paging in both guests. If true, we simply unprotect the page
- * and resume the guest.
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+ * indicate continuing the page fault handling until to the final
+ * page table mapping phase.
*/
- if (vcpu->arch.mmu->root_role.direct &&
- (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
- return 1;
- }
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
+ if (r != RET_PF_EMULATE)
+ return r;
- /*
- * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
- * optimistically try to just unprotect the page and let the processor
- * re-execute the instruction that caused the page fault. Do not allow
- * retrying MMIO emulation, as it's not only pointless but could also
- * cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address. Retrying an instruction
- * from a nested guest is also pointless and dangerous as we are only
- * explicitly shadowing L1's page tables, i.e. unprotecting something
- * for L1 isn't going to magically fix whatever issue cause L2 to fail.
- */
- if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
- emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ int root_level, leaf, level;
+
+ leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
+ if (unlikely(leaf < 0))
+ return;
+
+ pr_err("%s %llx", msg, gpa);
+ for (level = root_level; level >= leaf; level--)
+ pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes);
+
static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 addr, hpa_t root_hpa)
{
@@ -5946,10 +6191,10 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(addr, vcpu))
+ if (is_noncanonical_invlpg_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
}
if (!mmu->sync_spte)
@@ -6035,59 +6280,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot);
-
-static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn,
- bool flush_on_yield, bool flush)
-{
- struct slot_rmap_walk_iterator iterator;
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, slot);
-
- if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_range(kvm, start_gfn,
- iterator.gfn - start_gfn + 1);
- flush = false;
- }
- cond_resched_rwlock_write(&kvm->mmu_lock);
- }
- }
-
- return flush;
-}
-
-static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- bool flush_on_yield)
-{
- return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
- slot->base_gfn, slot->base_gfn + slot->npages - 1,
- flush_on_yield, false);
-}
-
-static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- bool flush_on_yield)
-{
- return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
-}
-
static void free_mmu_pages(struct kvm_mmu *mmu)
{
if (!tdp_enabled && mmu->pae_root)
@@ -6104,6 +6296,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -6160,7 +6353,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6184,8 +6380,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
+ LIST_HEAD(invalid_list);
bool unstable;
+ lockdep_assert_held(&kvm->slots_lock);
+
restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) {
@@ -6217,7 +6416,7 @@ restart:
}
unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ &invalid_list, &nr_zapped);
batch += nr_zapped;
if (unstable)
@@ -6233,7 +6432,7 @@ restart:
* kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
* running with an obsolete MMU.
*/
- kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
/*
@@ -6267,8 +6466,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (tdp_mmu_enabled)
- kvm_tdp_mmu_invalidate_all_roots(kvm);
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
@@ -6293,18 +6497,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* lead to use-after-free.
*/
if (tdp_mmu_enabled)
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
-}
-
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
- return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
void kvm_mmu_init_vm(struct kvm *kvm)
{
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
@@ -6357,9 +6556,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
}
}
@@ -6539,7 +6737,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
continue;
}
- spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+ spte = make_small_spte(kvm, huge_spte, sp->role, index);
mmu_spte_set(sptep, spte);
__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
}
@@ -6647,7 +6845,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
*/
for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
+ level, level, start, end - 1, true, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6722,8 +6920,7 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
- sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
- PG_LEVEL_NUM)) {
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
@@ -6737,6 +6934,7 @@ restart:
return need_tlb_flush;
}
+EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -6750,8 +6948,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
kvm_flush_remote_tlbs_memslot(kvm, slot);
}
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
@@ -6761,7 +6959,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ kvm_tdp_mmu_recover_huge_pages(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6825,10 +7023,70 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
+static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ bool flush)
+{
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_flush;
+
+ /*
+ * Since accounting information is stored in struct kvm_arch_memory_slot,
+ * all MMU pages that are shadowing guest PTEs must be zapped before the
+ * memslot is deleted, as freeing such pages after the memslot is freed
+ * will result in use-after-free, e.g. in unaccount_shadowed().
+ */
+ for (i = 0; i < slot->npages; i++) {
+ struct kvm_mmu_page *sp;
+ gfn_t gfn = slot->base_gfn + i;
+
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+out_flush:
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+}
+
+static void kvm_mmu_zap_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ };
+ bool flush;
+
+ write_lock(&kvm->mmu_lock);
+ flush = kvm_unmap_gfn_range(kvm, &range);
+ kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush);
+ write_unlock(&kvm->mmu_lock);
+}
+
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_zap_all_fast(kvm);
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot(kvm, slot);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -6856,77 +7114,23 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
-static unsigned long mmu_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static void mmu_destroy_caches(void)
{
- struct kvm *kvm;
- int nr_to_scan = sc->nr_to_scan;
- unsigned long freed = 0;
-
- mutex_lock(&kvm_lock);
-
- list_for_each_entry(kvm, &vm_list, vm_list) {
- int idx;
- LIST_HEAD(invalid_list);
-
- /*
- * Never scan more than sc->nr_to_scan VM instances.
- * Will not hit this condition practically since we do not try
- * to shrink more than one VM and it is very unlikely to see
- * !n_used_mmu_pages so many times.
- */
- if (!nr_to_scan--)
- break;
- /*
- * n_used_mmu_pages is accessed without holding kvm->mmu_lock
- * here. We may skip a VM instance errorneosly, but we do not
- * want to shrink a VM that only started to populate its MMU
- * anyway.
- */
- if (!kvm->arch.n_used_mmu_pages &&
- !kvm_has_zapped_obsolete_pages(kvm))
- continue;
-
- idx = srcu_read_lock(&kvm->srcu);
- write_lock(&kvm->mmu_lock);
-
- if (kvm_has_zapped_obsolete_pages(kvm)) {
- kvm_mmu_commit_zap_page(kvm,
- &kvm->arch.zapped_obsolete_pages);
- goto unlock;
- }
-
- freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
- write_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
-
- /*
- * unfair on small ones
- * per-vm shrinkers cry out
- * sadness comes quickly
- */
- list_move_tail(&kvm->vm_list, &vm_list);
- break;
- }
-
- mutex_unlock(&kvm_lock);
- return freed;
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
-static unsigned long mmu_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
{
- return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker *mmu_shrinker;
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be valid even though the VM is globally visible. Do nothing,
+ * as such a VM can't have any possible NX huge pages.
+ */
+ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
-static void mmu_destroy_caches(void)
-{
- kmem_cache_destroy(pte_list_desc_cache);
- kmem_cache_destroy(mmu_page_header_cache);
+ if (nx_thread)
+ vhost_task_wake(nx_thread);
}
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
@@ -6989,7 +7193,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
}
mutex_unlock(&kvm_lock);
}
@@ -7039,9 +7243,7 @@ int kvm_mmu_vendor_module_init(void)
kvm_mmu_reset_all_pte_masks();
- pte_list_desc_cache = kmem_cache_create("pte_list_desc",
- sizeof(struct pte_list_desc),
- 0, SLAB_ACCOUNT, NULL);
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
if (!pte_list_desc_cache)
goto out;
@@ -7051,23 +7253,8 @@ int kvm_mmu_vendor_module_init(void)
if (!mmu_page_header_cache)
goto out;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
- goto out;
-
- mmu_shrinker = shrinker_alloc(0, "x86-mmu");
- if (!mmu_shrinker)
- goto out_shrinker;
-
- mmu_shrinker->count_objects = mmu_shrink_count;
- mmu_shrinker->scan_objects = mmu_shrink_scan;
- mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
-
- shrinker_register(mmu_shrinker);
-
return 0;
-out_shrinker:
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;
@@ -7076,6 +7263,12 @@ out:
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
free_mmu_pages(&vcpu->arch.root_mmu);
free_mmu_pages(&vcpu->arch.guest_mmu);
mmu_free_memory_caches(vcpu);
@@ -7084,8 +7277,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- shrinker_free(mmu_shrinker);
}
/*
@@ -7137,7 +7328,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
mutex_unlock(&kvm_lock);
}
@@ -7240,62 +7431,68 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
{
+}
+
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+ struct kvm *kvm = data;
bool enabled;
uint period;
+ long remaining_time;
enabled = calc_nx_huge_pages_recovery_period(&period);
+ if (!enabled)
+ return false;
- return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+ - get_jiffies_64();
+ if (remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ /* check for signals and come back */
+ return true;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ kvm_recover_nx_huge_pages(kvm);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ return true;
}
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
{
- u64 start_time;
- long remaining_time;
+ struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct vhost_task *nx_thread;
- while (true) {
- start_time = get_jiffies_64();
- remaining_time = get_nx_huge_page_recovery_timeout(start_time);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+ kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop() && remaining_time > 0) {
- schedule_timeout(remaining_time);
- remaining_time = get_nx_huge_page_recovery_timeout(start_time);
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- set_current_state(TASK_RUNNING);
+ if (IS_ERR(nx_thread))
+ return PTR_ERR(nx_thread);
- if (kthread_should_stop())
- return 0;
+ vhost_task_start(nx_thread);
- kvm_recover_nx_huge_pages(kvm);
- }
+ /* Make the task visible only once it is fully started. */
+ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+ return 0;
}
int kvm_mmu_post_init_vm(struct kvm *kvm)
{
- int err;
-
if (nx_hugepage_mitigation_hard_disabled)
return 0;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
- "kvm-nx-lpage-recovery",
- &kvm->arch.nx_huge_page_recovery_thread);
- if (!err)
- kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
-
- return err;
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.nx_huge_page_recovery_thread)
- kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+ vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
@@ -7316,6 +7513,12 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
return kvm_unmap_gfn_range(kvm, range);
}
@@ -7344,7 +7547,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
@@ -7388,7 +7591,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
* by the memslot, KVM can't use a hugepage due to the
* misaligned address regardless of memory attributes.
*/
- if (gfn >= slot->base_gfn) {
+ if (gfn >= slot->base_gfn &&
+ gfn + nr_pages <= slot->base_gfn + slot->npages) {
if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
hugepage_clear_mixed(slot, gfn, level);
else
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 0669a8a668ca..75f00598289d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
+#include "mmu.h"
+
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
@@ -101,7 +103,22 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
- unsigned int unsync_children;
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
union {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
@@ -124,9 +141,6 @@ struct kvm_mmu_page {
int clear_spte_count;
#endif
- /* Number of writes since the last time traversal visited this page. */
- atomic_t write_flooding_count;
-
#ifdef CONFIG_X86_64
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
@@ -145,6 +159,34 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+ * external_spt is allocated for TDX module to hold private EPT mappings,
+ * TDX module will initialize the page by itself.
+ * Therefore, KVM does not need to initialize or access external_spt.
+ * KVM only interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits is only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
@@ -164,7 +206,7 @@ static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
}
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
- gfn_t gfn, bool can_unsync, bool prefetch);
+ gfn_t gfn, bool synchronizing, bool prefetch);
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
@@ -190,7 +232,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
- const u32 error_code;
+ const u64 error_code;
const bool prefetch;
/* Derived from error_code. */
@@ -229,16 +271,21 @@ struct kvm_page_fault {
*/
u8 goal_level;
- /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of VM where memslot's can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
gfn_t gfn;
/* The memslot containing gfn. May be NULL. */
struct kvm_memory_slot *slot;
- /* Outputs of kvm_faultin_pfn. */
+ /* Outputs of kvm_mmu_faultin_pfn(). */
unsigned long mmu_seq;
kvm_pfn_t pfn;
- hva_t hva;
+ struct page *refcounted_page;
bool map_writable;
/*
@@ -258,6 +305,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* RET_PF_CONTINUE: So far, so good, keep handling the page fault.
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
+ * gfn and retry, or emulate the instruction directly.
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
@@ -266,21 +315,37 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*
* Note, all values must be greater than or equal to zero so as not to encroach
- * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
- * will allow for efficient machine code when checking for CONTINUE, e.g.
- * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
+ * on -errno return values.
*/
enum {
RET_PF_CONTINUE = 0,
RET_PF_RETRY,
RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
RET_PF_INVALID,
RET_PF_FIXED,
RET_PF_SPURIOUS,
};
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -298,55 +363,55 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
- .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
};
int r;
if (vcpu->arch.mmu->root_role.direct) {
- fault.gfn = fault.addr >> PAGE_SHIFT;
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
/*
- * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
- * guest perspective and have already been counted at the time of the
- * original fault.
+ * With retpoline being active an indirect call is rather expensive,
+ * so do a direct call in the most common case.
*/
- if (!prefetch)
- vcpu->stat.pf_taken++;
-
- if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
- /*
- * Similar to above, prefetch faults aren't truly spurious, and the
- * async #PF path doesn't do emulation. Do count faults that are fixed
- * by the async #PF handler though, otherwise they'll never be counted.
- */
- if (r == RET_PF_FIXED)
- vcpu->stat.pf_fixed++;
- else if (prefetch)
- ;
- else if (r == RET_PF_EMULATE)
- vcpu->stat.pf_emulate++;
- else if (r == RET_PF_SPURIOUS)
- vcpu->stat.pf_spurious++;
return r;
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level);
+ const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
-void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..f35a830ce469 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -57,6 +57,7 @@
TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
TRACE_DEFINE_ENUM(RET_PF_RETRY);
TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
TRACE_DEFINE_ENUM(RET_PF_INVALID);
TRACE_DEFINE_ENUM(RET_PF_FIXED);
TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
@@ -260,7 +261,7 @@ TRACE_EVENT(
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(gpa_t, cr2_or_gpa)
- __field(u32, error_code)
+ __field(u64, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index c87da11f3a04..561c331fd6ec 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -20,15 +20,28 @@
#include "mmu_internal.h"
#include "page_track.h"
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+ * with the smp_store_release in kvm_page_track_write_tracking_enable().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
- return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
- !tdp_enabled || kvm_shadow_root_allocated(kvm);
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
}
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
- kvfree(slot->arch.gfn_write_track);
+ vfree(slot->arch.gfn_write_track);
slot->arch.gfn_write_track = NULL;
}
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
return init_srcu_struct(&head->track_srcu);
}
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
/*
* register the notifier so that event interception for the tracked guest
* pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ int r;
if (!kvm || kvm->mm != current->mm)
return -ESRCH;
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
kvm_get_kvm(kvm);
head = &kvm->arch.track_notifier_head;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4d4e98fe4f35..f4711674c47b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -497,21 +497,21 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED);
+ walker->fault.exit_qualification = 0;
+
if (write_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
- EPT_VIOLATION_RWX_SHIFT;
+ walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
}
#endif
walker->fault.address = addr;
@@ -533,10 +533,8 @@ static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte)
{
- struct kvm_memory_slot *slot;
unsigned pte_access;
gfn_t gfn;
- kvm_pfn_t pfn;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
@@ -545,17 +543,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
- if (!slot)
- return false;
-
- pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
- if (is_error_pfn(pfn))
- return false;
-
- mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
- kvm_release_pfn_clean(pfn);
- return true;
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
}
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -646,10 +634,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* really care if it changes underneath us after this point).
*/
if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
/*
* Load a new root and retry the faulting instruction in the extremely
@@ -659,7 +647,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
- goto out_gpte_changed;
+ return RET_PF_RETRY;
}
for_each_shadow_entry(vcpu, fault->addr, it) {
@@ -674,34 +662,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
false, access);
- if (sp != ERR_PTR(-EEXIST)) {
- /*
- * We must synchronize the pagetable before linking it
- * because the guest doesn't need to flush tlb when
- * the gpte is changed from non-present to present.
- * Otherwise, the guest may use the wrong mapping.
- *
- * For PG_LEVEL_4K, kvm_mmu_get_page() has already
- * synchronized it transiently via kvm_sync_page().
- *
- * For higher level pagetable, we synchronize it via
- * the slower mmu_sync_children(). If it needs to
- * break, some progress has been made; return
- * RET_PF_RETRY and retry on the next #PF.
- * KVM_REQ_MMU_SYNC is not necessary but it
- * expedites the process.
- */
- if (sp->unsync_children &&
- mmu_sync_children(vcpu, sp, false))
- return RET_PF_RETRY;
- }
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot be unsync themselves
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
/*
- * Verify that the gpte in the page we've just write
- * protected is still there.
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
*/
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
@@ -755,9 +747,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return ret;
-
-out_gpte_changed:
- return RET_PF_RETRY;
}
/*
@@ -805,14 +794,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (page_fault_handle_page_track(vcpu, fault)) {
shadow_page_table_clear_flood(vcpu, fault->addr);
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
}
r = mmu_topup_memory_caches(vcpu, true);
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -847,8 +836,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = FNAME(fetch)(vcpu, fault, &walker);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -891,9 +880,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/*
* Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
- * safe because:
- * - The spte has a reference to the struct page, so the pfn for a given gfn
- * can't change unless all sptes pointing to it are nuked first.
+ * safe because SPTEs are protected by mmu_notifiers and memslot generations, so
+ * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are
+ * nuked first.
*
* Returns
* < 0: failed to sync spte
@@ -911,7 +900,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
@@ -933,13 +923,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
@@ -961,9 +951,14 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
host_writable = spte & shadow_host_writable_mask;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
+ spte_to_pfn(spte), spte, true, true,
host_writable, &spte);
+ /*
+ * There is no need to mark the pfn dirty, as the new protections must
+ * be a subset of the old protections, i.e. synchronizing a SPTE cannot
+ * change the SPTE from read-only to writable.
+ */
return mmu_spte_update(sptep, spte);
}
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4a599130e9c9..22551e2f1d00 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching;
module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
EXPORT_SYMBOL_GPL(enable_mmio_caching);
+bool __read_mostly kvm_ad_enabled;
+
u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
@@ -43,7 +45,25 @@ u64 __read_mostly shadow_acc_track_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-u8 __read_mostly shadow_phys_bits;
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
void __init kvm_mmu_spte_module_init(void)
{
@@ -55,6 +75,8 @@ void __init kvm_mmu_spte_module_init(void)
* will change when the vendor module is (re)loaded.
*/
allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
}
static u64 generation_mmio_spte_mask(u64 gen)
@@ -74,10 +96,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!shadow_mmio_value);
+ WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
access &= shadow_mmio_access_mask;
- spte |= shadow_mmio_value | access;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -113,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
*/
bool spte_has_volatile_bits(u64 spte)
{
- /*
- * Always atomically update spte if it can be updated
- * out of mmu-lock, it can ensure dirty bit is not lost,
- * also, it can help us to get a stable is_writable_pte()
- * to ensure tlb flush is not missed.
- */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
@@ -137,29 +153,29 @@ bool spte_has_volatile_bits(u64 spte)
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
- u64 old_spte, bool prefetch, bool can_unsync,
+ u64 old_spte, bool prefetch, bool synchronizing,
bool host_writable, u64 *new_spte)
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
- WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
spte |= shadow_present_mask;
- if (!prefetch)
- spte |= spte_shadow_accessed_mask(spte);
+ if (!prefetch || synchronizing)
+ spte |= shadow_accessed_mask;
/*
* For simplicity, enforce the NX huge page mitigation even if not
@@ -190,8 +206,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= PT_PAGE_SIZE_MASK;
if (shadow_memtype_mask)
- spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -203,41 +219,39 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
-
- /*
- * Optimization: for pte sync, if spte was writable the hash
- * lookup is unnecessary (and expensive). Write protection
- * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
- * Same reasoning can be applied to dirty page accounting.
- */
- if (is_writable_pte(old_spte))
- goto out;
-
/*
* Unsync shadow pages that are reachable by the new, writable
* SPTE. Write-protect the SPTE if the page can't be unsync'd,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
+ *
+ * When overwriting an existing leaf SPTE, and the old SPTE was
+ * writable, skip trying to unsync shadow pages as any relevant
+ * shadow pages must already be unsync, i.e. the hash lookup is
+ * unnecessary (and expensive). Note, this relies on KVM not
+ * changing PFNs without first zapping the old SPTE, which is
+ * guaranteed by both the shadow MMU and the TDP MMU.
*/
- if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
+ if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) &&
+ mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch))
wrprot = true;
- pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
- }
+ else
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
+ shadow_dirty_mask;
}
- if (pte_access & ACC_WRITE_MASK)
- spte |= spte_shadow_dirty_mask(spte);
-
-out:
- if (prefetch)
+ if (prefetch && !synchronizing)
spte = mark_spte_for_access_track(spte);
WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
"spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+ /*
+ * Mark the memslot dirty *after* modifying it for access tracking.
+ * Unlike folios, memslots can be safely marked dirty out of mmu_lock,
+ * i.e. in the fast page fault handler.
+ */
if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
/* Enforced by kvm_mmu_hugepage_adjust. */
WARN_ON_ONCE(level > PG_LEVEL_4K);
@@ -248,15 +262,15 @@ out:
return wrprot;
}
-static u64 make_spte_executable(u64 spte)
+static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
{
bool is_access_track = is_access_track_spte(spte);
if (is_access_track)
spte = restore_acc_track_spte(spte);
- spte &= ~shadow_nx_mask;
- spte |= shadow_x_mask;
+ KVM_MMU_WARN_ON(set & clear);
+ spte = (spte | set) & ~clear;
if (is_access_track)
spte = mark_spte_for_access_track(spte);
@@ -264,6 +278,16 @@ static u64 make_spte_executable(u64 spte)
return spte;
}
+static u64 make_spte_executable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
+}
+
+static u64 make_spte_nonexecutable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+}
+
/*
* Construct an SPTE that maps a sub-page of the given huge page SPTE where
* `index` identifies which sub-page.
@@ -271,18 +295,12 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
-u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
- int index)
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
{
- u64 child_spte;
-
- if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
- return 0;
+ u64 child_spte = huge_spte;
- if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
- return 0;
-
- child_spte = huge_spte;
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
/*
* The child_spte already has the base address of the huge page being
@@ -306,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page
return child_spte;
}
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
+{
+ u64 huge_spte;
+
+ KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);
+
+ huge_spte = small_spte | PT_PAGE_SIZE_MASK;
+
+ /*
+ * huge_spte already has the address of the sub-page being collapsed
+ * from small_spte, so just clear the lower address bits to create the
+ * huge page address.
+ */
+ huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
+
+ if (is_nx_huge_page_enabled(kvm))
+ huge_spte = make_spte_nonexecutable(huge_spte);
+
+ return huge_spte;
+}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
@@ -322,22 +360,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
-{
- u64 new_spte;
-
- new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~shadow_host_writable_mask;
- new_spte &= ~shadow_mmu_writable_mask;
-
- new_spte = mark_spte_for_access_track(new_spte);
-
- return new_spte;
-}
-
u64 mark_spte_for_access_track(u64 spte)
{
if (spte_ad_enabled(spte))
@@ -354,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte)
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
- spte &= ~shadow_acc_track_mask;
+ spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);
return spte;
}
@@ -393,13 +415,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
mmio_value = 0;
/*
- * The masked MMIO value must obviously match itself and a removed SPTE
- * must not get a false positive. Removed SPTEs and MMIO SPTEs should
- * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
- WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
mmio_value = 0;
if (!mmio_value)
@@ -424,12 +446,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
+ kvm_ad_enabled = has_ad_bits;
+
shadow_user_mask = VMX_EPT_READABLE_MASK;
- shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
- shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
+ shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
/*
* EPT overrides the host MTRRs, and so KVM must program the desired
* memtype directly into the SPTEs. Note, this mask is just the mask
@@ -446,7 +472,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
- VMX_EPT_RWX_MASK, 0);
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
@@ -455,7 +481,7 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
- shadow_phys_bits = kvm_get_shadow_phys_bits();
+ kvm_ad_enabled = true;
/*
* If the CPU has 46 or less physical address bits, then set an
@@ -508,7 +534,7 @@ void kvm_mmu_reset_all_pte_masks(void)
* 52-bit physical addresses then there are no reserved PA bits in the
* PTEs and so the reserved PA approach must be disabled.
*/
- if (shadow_phys_bits < 52)
+ if (kvm_host.maxphyaddr < 52)
mask = BIT_ULL(51) | PT_PRESENT_MASK;
else
mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a129951c9a88..59746854c0af 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -3,6 +3,8 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
+#include <asm/vmx.h>
+
#include "mmu.h"
#include "mmu_internal.h"
@@ -149,6 +151,31 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
+
+/*
+ * True if A/D bits are supported in hardware and are enabled by KVM. When
+ * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable
+ * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario
+ * where KVM is using A/D bits for L1, but not L2.
+ */
+extern bool __read_mostly kvm_ad_enabled;
+
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@@ -184,24 +211,24 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
* If a thread running without exclusive control of the MMU lock must perform a
- * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
- * vulnerability. Use only low bits to avoid 64-bit immediates.
+ * vulnerability.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE 0x5a0ULL
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
-/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
-static inline bool is_removed_spte(u64 spte)
+static inline bool is_frozen_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == FROZEN_SPTE;
}
/* Get an SPTE's index into its parent's page table (and the spt array). */
@@ -249,9 +276,14 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
likely(enable_mmio_caching);
}
@@ -260,15 +292,11 @@ static inline bool is_shadow_present_pte(u64 pte)
return !!(pte & SPTE_MMU_PRESENT_MASK);
}
-/*
- * Returns true if A/D bits are supported in hardware and are enabled by KVM.
- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
- * scenario where KVM is using A/D bits for L1, but not L2.
- */
-static inline bool kvm_ad_enabled(void)
+static inline bool is_ept_ve_possible(u64 spte)
{
- return !!shadow_accessed_mask;
+ return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) &&
+ !(spte & VMX_EPT_SUPPRESS_VE_BIT) &&
+ (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -293,18 +321,6 @@ static inline bool spte_ad_need_write_protect(u64 spte)
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
-static inline u64 spte_shadow_accessed_mask(u64 spte)
-{
- KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
- return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
-}
-
-static inline u64 spte_shadow_dirty_mask(u64 spte)
-{
- KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
- return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
-}
-
static inline bool is_access_track_spte(u64 spte)
{
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
@@ -332,17 +348,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)
static inline bool is_accessed_spte(u64 spte)
{
- u64 accessed_mask = spte_shadow_accessed_mask(spte);
-
- return accessed_mask ? spte & accessed_mask
- : !is_access_track_spte(spte);
-}
-
-static inline bool is_dirty_spte(u64 spte)
-{
- u64 dirty_mask = spte_shadow_dirty_mask(spte);
-
- return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
+ return spte & shadow_accessed_mask;
}
static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
@@ -460,6 +466,50 @@ static inline bool is_mmu_writable_spte(u64 spte)
return spte & shadow_mmu_writable_mask;
}
+/*
+ * Returns true if the access indicated by @fault is allowed by the existing
+ * SPTE protections. Note, the caller is responsible for checking that the
+ * SPTE is a shadow-present, leaf SPTE (either before or after).
+ */
+static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+{
+ if (fault->exec)
+ return is_executable_pte(spte);
+
+ if (fault->write)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
+ * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
+ * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM
+ * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
+ * not whether or not SPTEs were modified, i.e. only the write-tracking case
+ * needs to flush at the time the SPTEs is modified, before dropping mmu_lock.
+ *
+ * Don't flush if the Accessed bit is cleared, as access tracking tolerates
+ * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a
+ * mmu_notifier.clear_flush_young() event.
+ *
+ * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
+ * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
+ * when clearing dirty logs, KVM flushes based on whether or not dirty entries
+ * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
+ *
+ * Note, this logic only applies to shadow-present leaf SPTEs. The caller is
+ * responsible for checking that the old SPTE is shadow-present, and is also
+ * responsible for determining whether or not a TLB flush is required when
+ * modifying a shadow-present non-leaf SPTE.
+ */
+static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
+{
+ return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
+}
+
static inline u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
@@ -474,10 +524,11 @@ bool spte_has_volatile_bits(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
- u64 old_spte, bool prefetch, bool can_unsync,
+ u64 old_spte, bool prefetch, bool synchronizing,
bool host_writable, u64 *new_spte);
-u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
- union kvm_mmu_page_role role, int index);
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index);
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
@@ -496,8 +547,6 @@ static inline u64 restore_acc_track_spte(u64 spte)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
-
void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 04c247bfe318..9e17bfa80901 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
- SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
@@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter)
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn)
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
{
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
- (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
@@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter)
* Check if the iterator is already at the end of the current page
* table.
*/
- if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
(SPTE_ENT_PER_PAGE - 1))
return false;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index fae559559a80..047b78333653 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -21,11 +21,13 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
return xchg(rcu_dereference(sptep), new_spte);
}
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
@@ -91,8 +93,10 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
- /* The lowest GFN mapped by the current SPTE */
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
/* The level of the root page given to the iterator */
int root_level;
/* The lowest level the iterator should traverse to */
@@ -120,18 +124,23 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
- for (tdp_iter_start(&iter, root, min_level, start); \
- iter.valid && iter.gfn < end; \
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, start, end) \
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn);
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..046b6ba31197 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -37,8 +37,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
* ultimately frees all roots.
*/
- kvm_tdp_mmu_invalidate_all_roots(kvm);
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
@@ -53,6 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
+ free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -91,19 +92,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
/*
* Returns the next root after @prev_root (or the first root if @prev_root is
- * NULL). A reference to the returned root is acquired, and the reference to
- * @prev_root is released (the caller obviously must hold a reference to
- * @prev_root if it's non-NULL).
+ * NULL) that matches with @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
*
- * If @only_valid is true, invalid roots are skipped.
+ * Roots that doesn't match with @types are skipped.
*
* Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool only_valid)
+ enum kvm_tdp_mmu_root_types types)
{
struct kvm_mmu_page *next_root;
@@ -124,7 +139,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
typeof(*next_root), link);
while (next_root) {
- if ((!only_valid || !next_root->role.invalid) &&
+ if (tdp_mmu_root_match(next_root, types) &&
kvm_tdp_mmu_get_root(next_root))
break;
@@ -149,20 +164,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
- ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, false))
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -171,12 +186,16 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
- if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
- kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types)))) { \
} else
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
+
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -216,22 +235,44 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
+ if (mirror)
+ role.is_mirror = true;
+
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
/*
- * Check for an existing root before allocating a new one. Note, the
- * role check prevents consuming an invalid root.
+ * Recheck for an existing root after acquiring the pages lock, another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
*/
- for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(root))
- goto out;
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
}
root = tdp_mmu_alloc_sp(vcpu);
@@ -245,13 +286,23 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
-
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-out:
- return __pa(root->spt);
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -289,6 +340,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+ int ret;
+
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ /* Because write lock is held, operation should success. */
+ ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ KVM_BUG_ON(ret, kvm);
+}
+
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -326,14 +400,14 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/*
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
- * already marked as removed then another thread
+ * already marked as frozen then another thread
* handling a page fault could overwrite it, so
* set the SPTE until it is set from some other
- * value to the removed SPTE value.
+ * value to the frozen SPTE value.
*/
for (;;) {
- old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
break;
cpu_relax();
}
@@ -364,11 +438,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* No retry is needed in the atomic update path as the
* sole concern is dropping a Dirty bit, i.e. no other
* task can zap/remove the SPTE as mmu_lock is held for
- * write. Marking the SPTE as a removed SPTE is not
+ * write. Marking the SPTE as a frozen SPTE is not
* strictly necessary for the same reason, but using
- * the remove SPTE value keeps the shared/exclusive
+ * the frozen SPTE value keeps the shared/exclusive
* paths consistent and allows the handle_changed_spte()
- * call below to hardcode the new value to REMOVED_SPTE.
+ * call below to hardcode the new value to FROZEN_SPTE.
*
* Note, even though dropping a Dirty bit is the only
* scenario where a non-atomic update could result in a
@@ -380,15 +454,85 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* it here.
*/
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
- REMOVED_SPTE, level);
+ FROZEN_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, REMOVED_SPTE, level, shared);
+ old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+ * Use different call to either set up middle level
+ * external page table, or leaf.
+ */
+ if (is_leaf) {
+ ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
@@ -457,19 +601,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE or removed SPTE,
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
* it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
- !is_removed_spte(new_spte)))
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
+ !is_frozen_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
- "a temporary removed SPTE.\n"
+ "a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -478,10 +622,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (is_leaf != was_leaf)
kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
- if (was_leaf && is_dirty_spte(old_spte) &&
- (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-
/*
* Recursively handle child PTs if the change removed a subtree from
* the paging structure. Note the WARN on the PFN changing without the
@@ -491,10 +631,50 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+}
- if (was_leaf && is_accessed_spte(old_spte) &&
- (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ /*
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
+ */
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
+
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic()
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
+
+ return 0;
}
/*
@@ -514,68 +694,24 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* no side-effects other than setting iter->old_spte to the last
* known value of the spte.
*/
-static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
- u64 *sptep = rcu_dereference(iter->sptep);
-
- /*
- * The caller is responsible for ensuring the old SPTE is not a REMOVED
- * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
- * and pre-checking before inserting a new SPTE is advantageous as it
- * avoids unnecessary work.
- */
- WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+ int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock. On failure, i.e. if a different logical
- * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
- * the current value, so the caller operates on fresh data, e.g. if it
- * retries tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
-
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
-
- return 0;
-}
-
-static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
-{
- int ret;
-
- /*
- * Freeze the SPTE by setting it to a special,
- * non-present value. This will stop other threads from
- * immediately installing a present entry in its place
- * before the TLBs are flushed.
- */
- ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
if (ret)
return ret;
- kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
-
- /*
- * No other thread can overwrite the removed SPTE as they must either
- * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
- * overwrite the special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present to non-present. Use
- * the raw write helper to avoid an unnecessary check on volatile bits.
- */
- __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return 0;
}
-
/*
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
@@ -596,16 +732,26 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
/*
* No thread should be using this function to set SPTEs to or from the
- * temporary removed SPTE value.
+ * temporary frozen SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
- * use of the removed SPTE should not be necessary.
+ * use of the frozen SPTE should not be necessary.
*/
- WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
return old_spte;
}
@@ -618,18 +764,28 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
iter->gfn, iter->level);
}
-#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root, _start, _end)
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
- tdp_root_for_each_pte(_iter, _root, _start, _end) \
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
+#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
+
+static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
+ struct tdp_iter *iter)
+{
+ if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
+ return false;
+
+ /* Ensure forward progress has been made before yielding. */
+ return iter->next_last_level_gfn != iter->yielded_gfn;
+}
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -649,31 +805,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
- WARN_ON_ONCE(iter->yielded);
+ KVM_MMU_WARN_ON(iter->yielded);
- /* Ensure forward progress has been made before yielding. */
- if (iter->next_last_level_gfn == iter->yielded_gfn)
+ if (!tdp_mmu_iter_need_resched(kvm, iter))
return false;
- if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
- rcu_read_unlock();
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
- if (shared)
- cond_resched_rwlock_read(&kvm->mmu_lock);
- else
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ rcu_read_unlock();
- rcu_read_lock();
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
- WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
+ rcu_read_lock();
- iter->yielded = true;
- }
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
- return iter->yielded;
+ iter->yielded = true;
+ return true;
}
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
@@ -692,10 +844,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_exclusive();
- gfn_t start = 0;
-
- for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -707,8 +856,8 @@ retry:
continue;
if (!shared)
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
@@ -734,15 +883,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
/*
- * To avoid RCU stalls due to recursively removing huge swaths of SPs,
- * split the zap into two passes. On the first pass, zap at the 1gb
- * level, and then zap top-level SPs on the second pass. "1gb" is not
- * arbitrary, as KVM must be able to zap a 1gb shadow page without
- * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
*
- * Because zapping a SP recurses on its children, stepping down to
- * PG_LEVEL_4K in the iterator itself is unnecessary.
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
*/
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
@@ -764,8 +924,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+ SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
return true;
}
@@ -788,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
@@ -799,8 +959,14 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- flush = true;
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+
+ /*
+ * Zappings SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
}
rcu_read_unlock();
@@ -813,16 +979,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
*/
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@@ -833,19 +999,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *root;
/*
- * Zap all roots, including invalid roots, as all SPTEs must be dropped
- * before returning to the caller. Zap directly even if the root is
- * also being zapped by a worker. Walking zapped top-level SPTEs isn't
- * all that expensive and mmu_lock is already held, which means the
- * worker has yielded, i.e. flushing the work instead of zapping here
- * isn't guaranteed to be any faster.
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
*
* A TLB flush is unnecessary, KVM zaps everything if and only the VM
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
tdp_mmu_zap_root(kvm, root, false);
}
@@ -853,11 +1021,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
*/
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
struct kvm_mmu_page *root;
- read_lock(&kvm->mmu_lock);
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
@@ -875,7 +1046,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* that may be zapped, as such entries are associated with the
* ASID on both VMX and SVM.
*/
- tdp_mmu_zap_root(kvm, root, true);
+ tdp_mmu_zap_root(kvm, root, shared);
/*
* The referenced needs to be put *after* zapping the root, as
@@ -885,7 +1056,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
kvm_tdp_mmu_put_root(kvm, root);
}
- read_unlock(&kvm->mmu_lock);
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
}
/*
@@ -896,13 +1070,21 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the VM is being destroyed).
*
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
*/
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
{
struct kvm_mmu_page *root;
/*
+ * Invalidating invalid roots doesn't make sense, prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
* mmu_lock must be held for write to ensure that a root doesn't become
* invalid while there are active readers (invalidating a root while
* there are active readers may or may not be problematic in practice,
@@ -923,6 +1105,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* or get/put references to roots.
*/
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
/*
* Note, invalid roots can outlive a memslot update! Invalid
* roots must be *zapped* before the memslot update completes,
@@ -952,19 +1137,28 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
+ if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
+ return RET_PF_SPURIOUS;
+
+ if (is_shadow_present_pte(iter->old_spte) &&
+ is_access_allowed(fault, iter->old_spte) &&
+ is_last_spte(iter->old_spte, iter->level))
+ return RET_PF_SPURIOUS;
+
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
- fault->pfn, iter->old_spte, fault->prefetch, true,
- fault->map_writable, &new_spte);
+ fault->pfn, iter->old_spte, fault->prefetch,
+ false, fault->map_writable, &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
- !is_last_spte(iter->old_spte, iter->level))
+ (!is_last_spte(iter->old_spte, iter->level) ||
+ WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
/*
@@ -972,13 +1166,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (wrprot) {
- if (fault->write)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
@@ -1006,7 +1198,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
- u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
+ u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
int ret = 0;
if (shared) {
@@ -1031,7 +1223,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
@@ -1043,7 +1235,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1053,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* If SPTE has been frozen by another thread, just give up and
* retry, avoiding unnecessary page table allocation and free.
*/
- if (is_removed_spte(iter.old_spte))
+ if (is_frozen_spte(iter.old_spte))
goto retry;
if (iter.level == fault->goal_level)
@@ -1070,13 +1262,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- if (is_shadow_present_pte(iter.old_spte))
+ if (is_shadow_present_pte(iter.old_spte)) {
+ /* Don't support large page for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
- else
+ } else {
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
/*
* Force the guest to retry if installing an upper level SPTE
@@ -1111,45 +1308,22 @@ retry:
return ret;
}
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
- __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
return flush;
}
-typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range);
-
-static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
- struct kvm_gfn_range *range,
- tdp_handler_t handler)
-{
- struct kvm_mmu_page *root;
- struct tdp_iter iter;
- bool ret = false;
-
- /*
- * Don't support rescheduling, none of the MMU notifiers that funnel
- * into this helper allow blocking; it'd be dead, wasteful code.
- */
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
- rcu_read_lock();
-
- tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
- ret |= handler(kvm, &iter, range);
-
- rcu_read_unlock();
- }
-
- return ret;
-}
-
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
@@ -1158,15 +1332,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
-static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
+static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
{
u64 new_spte;
- /* If we have a non-accessed entry we don't need to change the pte. */
- if (!is_accessed_spte(iter->old_spte))
- return false;
-
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
@@ -1174,13 +1343,6 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
iter->level);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(iter->old_spte))
- kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
-
new_spte = mark_spte_for_access_track(iter->old_spte);
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
iter->old_spte, new_spte,
@@ -1189,69 +1351,52 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
iter->old_spte, new_spte);
- return true;
-}
-
-bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
-}
-
-static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
-{
- return is_accessed_spte(iter->old_spte);
-}
-
-bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
+static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ bool test_only)
{
- u64 new_spte;
-
- /* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
- if (iter->level != PG_LEVEL_4K ||
- !is_shadow_present_pte(iter->old_spte))
- return false;
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
/*
- * Note, when changing a read-only SPTE, it's not strictly necessary to
- * zero the SPTE before setting the new PFN, but doing so preserves the
- * invariant that the PFN of a present * leaf SPTE can never change.
- * See handle_changed_spte().
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code. Note,
+ * this helper must NOT be used to unmap GFNs, as it processes only
+ * valid roots!
*/
- tdp_mmu_iter_set_spte(kvm, iter, 0);
+ WARN_ON(types & ~KVM_VALID_ROOTS);
+ __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
+ guard(rcu)();
- if (!pte_write(range->arg.pte)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
- pte_pfn(range->arg.pte));
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
+ if (!is_accessed_spte(iter.old_spte))
+ continue;
+
+ if (test_only)
+ return true;
- tdp_mmu_iter_set_spte(kvm, iter, new_spte);
+ ret = true;
+ kvm_tdp_mmu_age_spte(&iter);
+ }
}
- return true;
+ return ret;
}
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- /*
- * No need to handle the remote TLB flush under RCU protection, the
- * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in handle_changed_spte().
- */
- return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
+}
+
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}
/*
@@ -1270,7 +1415,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1312,17 +1457,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
struct kvm_mmu_page *sp;
- gfp |= __GFP_ZERO;
-
- sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
if (!sp)
return NULL;
- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1331,47 +1474,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
return sp;
}
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
- struct tdp_iter *iter,
- bool shared)
-{
- struct kvm_mmu_page *sp;
-
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
- /*
- * Since we are allocating while under the MMU lock we have to be
- * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
- * reclaim and to avoid making any filesystem callbacks (which can end
- * up invoking KVM MMU notifiers, resulting in a deadlock).
- *
- * If this allocation fails we drop the lock and retry with reclaim
- * allowed.
- */
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
- if (sp)
- return sp;
-
- rcu_read_unlock();
-
- if (shared)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
-
- iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
-
- if (shared)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- return sp;
-}
-
/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
@@ -1385,7 +1487,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* not been linked in yet and thus is not reachable from any other CPU.
*/
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
- sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
+ sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
/*
* Replace the huge spte with a pointer to the populated lower level
@@ -1418,7 +1520,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
- int ret = 0;
rcu_read_lock();
@@ -1433,7 +1534,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
* level above the target level (e.g. splitting a 1GB to 512 2MB pages,
* and then splitting each of those to 512 4KB pages).
*/
- for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -1442,17 +1543,31 @@ retry:
continue;
if (!sp) {
- sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
if (!sp) {
- ret = -ENOMEM;
trace_kvm_mmu_split_huge_page(iter.gfn,
iter.old_spte,
- iter.level, ret);
- break;
+ iter.level, -ENOMEM);
+ return -ENOMEM;
}
- if (iter.yielded)
- continue;
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
}
tdp_mmu_init_child_sp(sp, &iter);
@@ -1473,7 +1588,7 @@ retry:
if (sp)
tdp_mmu_free_sp(sp);
- return ret;
+ return 0;
}
@@ -1498,23 +1613,26 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
}
}
-/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
- */
-static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end)
+static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
{
- u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+ /*
+ * All TDP MMU shadow pages share the same role as their root, aside
+ * from level, so it is valid to key off any shadow page to determine if
+ * write protection is needed for an entire tree.
+ */
+ return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
+}
+
+static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end)
+{
+ const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
- bool spte_set = false;
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1523,7 +1641,7 @@ retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
@@ -1531,59 +1649,43 @@ retry:
if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
-
- spte_set = true;
}
rcu_read_unlock();
- return spte_set;
}
/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot.
*/
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- bool spte_set = false;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
- spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
- slot->base_gfn + slot->npages);
-
- return spte_set;
+ clear_dirty_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages);
}
-/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
- */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
- u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
break;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
@@ -1602,18 +1704,15 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
iter.old_spte,
iter.old_spte & ~dbit);
- kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
}
rcu_read_unlock();
}
/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
*/
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
@@ -1622,25 +1721,59 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
-static void zap_collapsible_spte_range(struct kvm *kvm,
- struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot)
+static int tdp_mmu_make_huge_spte(struct kvm *kvm,
+ struct tdp_iter *parent,
+ u64 *huge_spte)
+{
+ struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
+ gfn_t start = parent->gfn;
+ gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
+ struct tdp_iter iter;
+
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
+ /*
+ * Use the parent iterator when checking for forward progress so
+ * that KVM doesn't get stuck continuously trying to yield (i.e.
+ * returning -EAGAIN here and then failing the forward progress
+ * check in the caller ad nauseam).
+ */
+ if (tdp_mmu_iter_need_resched(kvm, parent))
+ return -EAGAIN;
+
+ *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void recover_huge_pages_range(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
int max_mapping_level;
+ bool flush = false;
+ u64 huge_spte;
+ int r;
+
+ if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
+ return;
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
continue;
+ }
if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
!is_shadow_present_pte(iter.old_spte))
@@ -1664,31 +1797,40 @@ retry:
if (iter.gfn < start || iter.gfn >= end)
continue;
- max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
- iter.gfn, PG_LEVEL_NUM);
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;
- /* Note, a successful atomic zap also does a remote TLB flush. */
- if (tdp_mmu_zap_spte_atomic(kvm, &iter))
+ r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
+ if (r == -EAGAIN)
goto retry;
+ else if (r)
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
+ goto retry;
+
+ flush = true;
}
+ if (flush)
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
rcu_read_unlock();
}
/*
- * Zap non-leaf SPTEs (and free their associated page tables) which could
- * be replaced by huge pages, for GFNs within the slot.
+ * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
+ * huge SPTEs where possible.
*/
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
- zap_collapsible_spte_range(kvm, root, slot);
+ recover_huge_pages_range(kvm, root, slot);
}
/*
@@ -1707,7 +1849,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1740,7 +1882,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
@@ -1755,14 +1897,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
*root_level = vcpu->arch.mmu->root_role.level;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1781,15 +1923,15 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*
* WARNING: This function is only intended to be called during fast_page_fault.
*/
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
- gfn_t gfn = addr >> PAGE_SHIFT;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..52acf99d40a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
@@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
@@ -31,18 +76,17 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot);
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
@@ -65,7 +109,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void)
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte);
#ifdef CONFIG_X86_64