Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r--  arch/x86/kvm/mmu/mmu.c          |  9
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h |  5
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c     | 30
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.h     |  4
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c      | 77
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h      | 24
6 files changed, 94 insertions(+), 55 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d75524bc8423..951dae4e7175 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5884,6 +5884,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page *sp;
unsigned int ratio;
LIST_HEAD(invalid_list);
+ bool flush = false;
ulong to_zap;
rcu_idx = srcu_read_lock(&kvm->srcu);
@@ -5905,19 +5906,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
if (is_tdp_mmu_page(sp)) {
- kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
- sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+ flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
} else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
}
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
cond_resched_rwlock_write(&kvm->mmu_lock);
+ flush = false;
}
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
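The hunk above converts kvm_recover_nx_lpages() to accumulate a pending-TLB-flush flag and to consume it via kvm_mmu_remote_flush_or_zap() before every reschedule point. A minimal sketch of that accumulate-then-flush pattern, with the KVM specifics replaced by hypothetical stand-ins (do_zap() for kvm_tdp_mmu_zap_sp()/kvm_mmu_prepare_zap_page(), flush_or_commit() for kvm_mmu_remote_flush_or_zap(), should_yield() and yield_lock() for the need_resched()/rwlock_needbreak() check and cond_resched_rwlock_write()):

static bool do_zap(int i);               /* returns true if a TLB flush is now pending */
static void flush_or_commit(bool flush); /* flushes TLBs and/or commits zapped pages */
static bool should_yield(void);
static void yield_lock(void);

static void zap_many(int nr)
{
        bool flush = false;
        int i;

        for (i = 0; i < nr; i++) {
                flush |= do_zap(i);

                if (should_yield()) {
                        /* Consume the pending flush before dropping the lock... */
                        flush_or_commit(flush);
                        yield_lock();
                        /* ...then start accumulating again from scratch. */
                        flush = false;
                }
        }

        /* Final flush/commit for anything zapped since the last yield. */
        flush_or_commit(flush);
}

The invariant the new code maintains is that a flush owed by an earlier zap is never left outstanding across a dropped mmu_lock.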
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ec4fc28b325a..1f6f98c76bdf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return sp->role.smm ? 1 : 0;
+}
+
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
/*
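kvm_mmu_page_as_id() is hoisted into mmu_internal.h so the TDP iterator code can also map a shadow page to its address space (regular vs. SMM). That ID is what selects the memslot set covering the page's gfn; a hedged usage sketch, where lookup_slot() is purely illustrative while __kvm_memslots() and __gfn_to_memslot() are existing KVM helpers:

/*
 * Illustrative only: pick the memslot backing a shadow page's gfn by
 * first selecting the address space the page belongs to.
 */
static struct kvm_memory_slot *lookup_slot(struct kvm *kvm,
                                           struct kvm_mmu_page *sp)
{
        struct kvm_memslots *slots = __kvm_memslots(kvm, kvm_mmu_page_as_id(sp));

        return __gfn_to_memslot(slots, sp->gfn);
}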
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index e5f148106e20..b3ed302c1a35 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
}
/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
@@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
- iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->level = root_level;
- iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
-
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
- tdp_iter_refresh_sptep(iter);
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+ iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
- iter->valid = true;
+ tdp_iter_restart(iter);
}
/*
@@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
}
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
- return iter->pt_path[iter->root_level - 1];
-}
-
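tdp_iter_restart() factors the rewind-to-root logic out of tdp_iter_start(), so a walker that yields mid-traversal can resume without recomputing root_level, min_level, or the newly cached as_id. A sketch of how a caller might drive the iterator, mirroring the for_each_tdp_pte-style macros in tdp_iter.h; walk_one() and must_yield() are hypothetical:

static void walk_one(struct tdp_iter *iter);
static bool must_yield(void);

static void walk_range(u64 *root_pt, int root_level, gfn_t start, gfn_t end)
{
        struct tdp_iter iter;

        for (tdp_iter_start(&iter, root_pt, root_level, PG_LEVEL_4K, start);
             iter.valid && iter.gfn < end; tdp_iter_next(&iter)) {
                if (must_yield()) {
                        /* ...drop and reacquire mmu_lock here... */
                        tdp_iter_restart(&iter);
                        continue;
                }

                walk_one(&iter);
        }
}

Because tdp_iter_start() now derives as_id from sptep_to_sp(root_pt), root_pt must be the page table of a real shadow page, which holds for the callers in tdp_mmu.c.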
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 4cc177d75c4a..b1748b988d3a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -36,6 +36,8 @@ struct tdp_iter {
int min_level;
/* The iterator's current level within the paging structure */
int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
/* A snapshot of the value at sptep */
u64 old_spte;
/*
@@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c926c6b899a1..018d82e73e31 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield);
+ gfn_t start, gfn_t end, bool can_yield, bool flush);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
@@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
list_del(&root->link);
- zap_gfn_range(kvm, root, 0, max_gfn, false);
+ zap_gfn_range(kvm, root, 0, max_gfn, false, false);
free_page((unsigned long)root->spt);
kmem_cache_free(mmu_page_header_cache, root);
@@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
- return sp->role.smm ? 1 : 0;
-}
-
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
@@ -301,11 +296,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
*
* Given a page table that has been removed from the TDP paging structure,
* iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
bool shared)
{
- struct kvm_mmu_page *sp = sptep_to_sp(pt);
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
@@ -318,7 +318,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = pt + i;
+ sptep = rcu_dereference(pt) + i;
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
if (shared) {
@@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
cpu_relax();
}
} else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
old_child_spte = READ_ONCE(*sptep);
+ if (!is_shadow_present_pte(old_child_spte))
+ continue;
/*
* Marking the SPTE as a removed SPTE is not
@@ -481,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- u64 *root_pt = tdp_iter_root_pt(iter);
- struct kvm_mmu_page *root = sptep_to_sp(root_pt);
- int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -498,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
new_spte) != iter->old_spte)
return false;
- handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level, true);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return true;
}
@@ -527,7 +534,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present
* to non-present.
*/
- WRITE_ONCE(*iter->sptep, 0);
+ WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
return true;
}
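The rcu_dereference() added above (and in handle_removed_tdp_mmu_page()) is about sparse's __rcu annotation rather than a new RCU read-side requirement: tdp_ptep_t is an __rcu-qualified pointer, so even a path that owns the page table exclusively must strip the annotation before dereferencing it. A small illustration, assuming the tdp_ptep_t typedef from tdp_iter.h at this point in the series; read_spte() is illustrative only:

/* From tdp_iter.h (reproduced here for illustration). */
typedef u64 __rcu *tdp_ptep_t;

static u64 read_spte(tdp_ptep_t sptep)
{
        /* OK: strip the __rcu annotation, then read the SPTE. */
        return READ_ONCE(*rcu_dereference(sptep));

        /* return *sptep;  <-- a plain dereference would trigger a sparse warning */
}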
@@ -553,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
- tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
- struct kvm_mmu_page *root = sptep_to_sp(root_pt);
- int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -570,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
- __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level, false);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
iter->old_spte, new_spte,
iter->level);
}
@@ -648,9 +651,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
- iter->root_level, iter->min_level,
- iter->next_last_level_gfn);
+ tdp_iter_restart(iter);
return true;
}
@@ -667,20 +668,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.
+ * operation can cause a soft lockup. Note, in some use cases a flush may be
+ * required by prior actions. Ensure the pending flush is performed prior to
+ * yielding.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield)
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
{
struct tdp_iter iter;
- bool flush_needed = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
- flush_needed = false;
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ flush = false;
continue;
}
@@ -698,11 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
- flush_needed = true;
+ flush = true;
}
rcu_read_unlock();
- return flush_needed;
+ return flush;
}
/*
@@ -711,13 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
*/
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool can_yield)
{
struct kvm_mmu_page *root;
bool flush = false;
for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush |= zap_gfn_range(kvm, root, start, end, true);
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
return flush;
}
@@ -929,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
struct kvm_mmu_page *root, gfn_t start,
gfn_t end, unsigned long unused)
{
- return zap_gfn_range(kvm, root, start, end, false);
+ return zap_gfn_range(kvm, root, start, end, false, false);
}
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
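The new flush parameter makes the yield path responsible for a flush that may have been left pending by the caller: tdp_mmu_iter_cond_resched() performs it before dropping mmu_lock and then restarts the iterator. A simplified sketch of that contract, omitting the forward-progress (yielded_gfn) check and the RCU read-lock bracketing of the real function:

static bool iter_cond_resched_sketch(struct kvm *kvm, struct tdp_iter *iter,
                                     bool flush)
{
        if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
                return false;

        /* Consume the caller's pending flush before the lock can be dropped. */
        if (flush)
                kvm_flush_remote_tlbs(kvm);

        cond_resched_rwlock_write(&kvm->mmu_lock);

        /* The paging structure may have changed; re-descend from the root. */
        tdp_iter_restart(iter);

        return true;
}

A true return tells the caller (e.g. zap_gfn_range()) that the pending flush was consumed, which is why it clears its local flush flag and continues.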
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3b761c111bff..31096ece9b14 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -8,7 +8,29 @@
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool can_yield);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
+ gfn_t end)
+{
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+}
+static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+
+ /*
+ * Don't allow yielding, as the caller may have a flush pending. Note,
+ * if mmu_lock is held for write, zapping will never yield in this case,
+ * but explicitly disallow it for safety. The TDP MMU does not yield
+ * until it has made forward progress (steps sideways), and when zapping
+ * a single shadow page that it's guaranteed to see (thus the mmu_lock
+ * requirement), its "step sideways" will always step beyond the bounds
+ * of the shadow page's gfn range and stop iterating before yielding.
+ */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
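As the comment explains, kvm_tdp_mmu_zap_sp() deliberately passes can_yield=false and asserts that mmu_lock is held for write. An illustrative caller under those constraints; zap_one_nx_page() is hypothetical, and in this series the real caller is kvm_recover_nx_lpages(), which batches the flush as shown in the mmu.c hunk above:

static void zap_one_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        write_lock(&kvm->mmu_lock);

        /* The helper returns true if any present SPTE was zapped. */
        if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_zap_sp(kvm, sp))
                kvm_flush_remote_tlbs(kvm);

        write_unlock(&kvm->mmu_lock);
}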