diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 111 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 28 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 99 |
5 files changed, 196 insertions, 86 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8f19ea752704..311e4e1d7870 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -473,30 +473,6 @@ retry: } #endif -static bool spte_has_volatile_bits(u64 spte) -{ - if (!is_shadow_present_pte(spte)) - return false; - - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ - if (spte_can_locklessly_be_made_writable(spte) || - is_access_track_spte(spte)) - return true; - - if (spte_ad_enabled(spte)) { - if ((spte & shadow_accessed_mask) == 0 || - (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) - return true; - } - - return false; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_can_locklessly_be_made_writable(old_spte) && + if (is_mmu_writable_spte(old_spte) && !is_writable_pte(new_spte)) flush = true; @@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - if (!spte_has_volatile_bits(old_spte)) + if (!is_shadow_present_pte(old_spte) || + !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); else old_spte = __update_clear_spte_slow(sptep, 0ull); @@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_can_locklessly_be_made_writable(spte))) + !(pt_protect && is_mmu_writable_spte(spte))) return false; rmap_printk("spte %p %llx\n", sptep, *sptep); @@ -2804,8 +2781,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, const struct kvm_memory_slot *slot) { unsigned long hva; - pte_t *pte; - int level; + unsigned long flags; + int level = PG_LEVEL_4K; + pgd_t pgd; + p4d_t p4d; + pud_t pud; + pmd_t pmd; if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) return PG_LEVEL_4K; @@ -2820,10 +2801,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(kvm->mm, hva, &level); - if (unlikely(!pte)) - return PG_LEVEL_4K; + /* + * Lookup the mapping level in the current mm. The information + * may become stale soon, but it is safe to use as long as + * 1) mmu_notifier_retry was checked after taking mmu_lock, and + * 2) mmu_lock is taken now. + * + * We still need to disable IRQs to prevent concurrent tear down + * of page tables. + */ + local_irq_save(flags); + + pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + p4d = READ_ONCE(*p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + + pud = READ_ONCE(*pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + if (pud_large(pud)) { + level = PG_LEVEL_1G; + goto out; + } + + pmd = READ_ONCE(*pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (pmd_large(pmd)) + level = PG_LEVEL_2M; + +out: + local_irq_restore(flags); return level; } @@ -2992,9 +3006,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. + * MMIO SPTE will just be an expensive nop. Do not cache MMIO + * whose gfn is greater than host.MAXPHYADDR, any guest that + * generates such gfns is running nested and is being tricked + * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if + * and only if L1's MAXPHYADDR is inaccurate with respect to + * the hardware's). */ - if (unlikely(!shadow_mmio_value)) { + if (unlikely(!shadow_mmio_value) || + unlikely(fault->gfn > kvm_mmu_max_gfn())) { *ret_val = RET_PF_EMULATE; return true; } @@ -3153,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if (fault->write && - spte_can_locklessly_be_made_writable(spte)) { + if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* @@ -6237,12 +6256,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) return 0; } -int kvm_mmu_module_init(void) +/* + * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as + * its default value of -1 is technically undefined behavior for a boolean. + */ +void kvm_mmu_x86_module_init(void) { - int ret = -ENOMEM; - if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); +} + +/* + * The bulk of the MMU initialization is deferred until the vendor module is + * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need + * to be reset when a potentially different vendor module is loaded. + */ +int kvm_mmu_vendor_module_init(void) +{ + int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an @@ -6290,7 +6321,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) mmu_free_memory_caches(vcpu); } -void kvm_mmu_module_exit(void) +void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4739b53c9734..e5c0b6db6f2c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } +/* + * Returns true if the SPTE has bits that may be set without holding mmu_lock. + * The caller is responsible for checking if the SPTE is shadow-present, and + * for determining whether or not the caller cares about non-leaf SPTEs. + */ +bool spte_has_volatile_bits(u64 spte) +{ + /* + * Always atomically update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) + return true; + + if (spte_ad_enabled(spte)) { + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) + return true; + } + + return false; +} + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 73f12615416f..80ab0f5cff01 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -extern u8 __read_mostly shadow_phys_bits; - static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && @@ -396,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte) "kvm: Writable SPTE is not MMU-writable: %llx", spte); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } @@ -410,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } +bool spte_has_volatile_bits(u64 spte); + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1eaf6ec0e0b..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include "mmu.h" +#include "spte.h" /* * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) @@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) { return READ_ONCE(*rcu_dereference(sptep)); } -static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val) + +static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) +{ + return xchg(rcu_dereference(sptep), new_spte); +} + +static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) +{ + WRITE_ONCE(*rcu_dereference(sptep), new_spte); +} + +static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, + u64 new_spte, int level) { - WRITE_ONCE(*rcu_dereference(sptep), val); + /* + * Atomically write the SPTE if it is a shadow-present, leaf SPTE with + * volatile bits, i.e. has bits that can be set outside of mmu_lock. + * The Writable bit can be set by KVM's fast page fault handler, and + * Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + */ + if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte)) + return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); + + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return old_spte; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d71d177ae6b8..922b06bf4b94 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -51,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); @@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - u64 *sptep = rcu_dereference(pt) + i; + tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); - u64 old_child_spte; + u64 old_spte; if (shared) { /* @@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_child_spte = xchg(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_child_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_spte)) break; cpu_relax(); } @@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * are guarded by the memslots generation, not by being * unreachable. */ - old_child_spte = READ_ONCE(*sptep); - if (!is_shadow_present_pte(old_child_spte)) + old_spte = kvm_tdp_mmu_read_spte(sptep); + if (!is_shadow_present_pte(old_spte)) continue; /* - * Marking the SPTE as a removed SPTE is not - * strictly necessary here as the MMU lock will - * stop other threads from concurrently modifying - * this SPTE. Using the removed SPTE value keeps - * the two branches consistent and simplifies - * the function. + * Use the common helper instead of a raw WRITE_ONCE as + * the SPTE needs to be updated atomically if it can be + * modified by a different vCPU outside of mmu_lock. + * Even though the parent SPTE is !PRESENT, the TLB + * hasn't yet been flushed, and both Intel and AMD + * document that A/D assists can use upper-level PxE + * entries that are cached in the TLB, i.e. the CPU can + * still access the page and mark it dirty. + * + * No retry is needed in the atomic update path as the + * sole concern is dropping a Dirty bit, i.e. no other + * task can zap/remove the SPTE as mmu_lock is held for + * write. Marking the SPTE as a removed SPTE is not + * strictly necessary for the same reason, but using + * the remove SPTE value keeps the shared/exclusive + * paths consistent and allows the handle_changed_spte() + * call below to hardcode the new value to REMOVED_SPTE. + * + * Note, even though dropping a Dirty bit is the only + * scenario where a non-atomic update could result in a + * functional bug, simply checking the Dirty bit isn't + * sufficient as a fast page fault could read the upper + * level SPTE before it is zapped, and then make this + * target SPTE writable, resume the guest, and set the + * Dirty bit between reading the SPTE above and writing + * it here. */ - WRITE_ONCE(*sptep, REMOVED_SPTE); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, + REMOVED_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level, - shared); + old_spte, REMOVED_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, KVM_PAGES_PER_HPAGE(iter->level)); /* - * No other thread can overwrite the removed SPTE as they - * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overwrite the - * special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present - * to non-present. + * No other thread can overwrite the removed SPTE as they must either + * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not + * overwrite the special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present to non-present. Use + * the raw write helper to avoid an unnecessary check on volatile bits. */ - kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, 0); return 0; } @@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * unless performing certain dirty logging operations. * Leaving record_dirty_log unset in that case prevents page * writes from being double counted. + * + * Returns the old SPTE value, which _may_ be different than @old_spte if the + * SPTE had voldatile bits. */ -static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level, + bool record_acc_track, bool record_dirty_log) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, */ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); - kvm_tdp_mmu_write_spte(sptep, new_spte); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); @@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, if (record_dirty_log) handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte, level); + return old_spte; } static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, @@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, { WARN_ON_ONCE(iter->yielded); - __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, - new_spte, iter->gfn, iter->level, - record_acc_track, record_dirty_log); + iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level, + record_acc_track, record_dirty_log); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, @@ -815,14 +839,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, return iter->yielded; } -static inline gfn_t tdp_mmu_max_gfn_host(void) +static inline gfn_t tdp_mmu_max_gfn_exclusive(void) { /* - * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that - * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, - * and so KVM will never install a SPTE for such addresses. + * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with + * a gpa range that would exceed the max gfn, and KVM does not create + * MMIO SPTEs for "impossible" gfns, instead sending such accesses down + * the slow emulation path every time. */ - return 1ULL << (shadow_phys_bits - PAGE_SHIFT); + return kvm_mmu_max_gfn() + 1; } static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, @@ -830,7 +855,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - gfn_t end = tdp_mmu_max_gfn_host(); + gfn_t end = tdp_mmu_max_gfn_exclusive(); gfn_t start = 0; for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { @@ -923,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - end = min(end, tdp_mmu_max_gfn_host()); + end = min(end, tdp_mmu_max_gfn_exclusive()); lockdep_assert_held_write(&kvm->mmu_lock); |