diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 894 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 68 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 22 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 32 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 43 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 35 |
9 files changed, 634 insertions, 479 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 70979358a0ad..845d114ae075 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -55,7 +55,7 @@ extern bool itlb_multihit_kvm_mitigation; -static int __read_mostly nx_huge_pages = -1; +int __read_mostly nx_huge_pages = -1; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -176,9 +176,80 @@ static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +struct kvm_mmu_role_regs { + const unsigned long cr0; + const unsigned long cr4; + const u64 efer; +}; + #define CREATE_TRACE_POINTS #include "mmutrace.h" +/* + * Yes, lot's of underscores. They're a hint that you probably shouldn't be + * reading from the role_regs. Once the mmu_role is constructed, it becomes + * the single source of truth for the MMU's state. + */ +#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ +static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +{ \ + return !!(regs->reg & flag); \ +} +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); + +/* + * The MMU itself (with a valid role) is the single source of truth for the + * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The + * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, + * and the vCPU may be incorrect/irrelevant. + */ +#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ +static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +{ \ + return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ +} +BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg); +BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); +BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); + +static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_role_regs regs = { + .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), + .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), + .efer = vcpu->arch.efer, + }; + + return regs; +} + +static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs) +{ + if (!____is_cr0_pg(regs)) + return 0; + else if (____is_efer_lma(regs)) + return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (____is_cr4_pae(regs)) + return PT32E_ROOT_LEVEL; + else + return PT32_ROOT_LEVEL; +} static inline bool kvm_available_flush_tlb_with_range(void) { @@ -208,11 +279,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -269,11 +335,6 @@ static int is_cpuid_PSE36(void) return 1; } -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.efer & EFER_NX; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -1177,8 +1238,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, * @gfn_offset: start of the BITS_PER_LONG pages we care about * @mask: indicates which pages we should protect * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * Used when we do not need to care about huge page mappings. */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1189,6 +1249,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1218,6 +1282,10 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1235,13 +1303,36 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to * enable dirty logging for them. * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * We need to care about huge page mappings: e.g. during dirty logging we may + * have such mappings. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { + /* + * Huge pages are NOT write protected when we start dirty logging in + * initially-all-set mode; must write protect them here so that they + * are split to 4K on the first write. + * + * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn + * of memslot has no such restriction, so the range can cross two large + * pages. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { + gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); + gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); + + /* Cross two large pages? */ + if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != + ALIGN(end << PAGE_SHIFT, PMD_SIZE)) + kvm_mmu_slot_gfn_write_protect(kvm, slot, end, + PG_LEVEL_2M); + } + + /* Now handle 4K PTEs. */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else @@ -1254,20 +1345,23 @@ int kvm_cpu_dirty_log_size(void) } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn) + struct kvm_memory_slot *slot, u64 gfn, + int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } } if (is_tdp_mmu_enabled(kvm)) write_protected |= - kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } @@ -1277,7 +1371,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1433,9 +1527,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1445,9 +1540,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1500,9 +1596,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1512,9 +1609,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1748,17 +1846,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else -static inline bool is_ept_sp(struct kvm_mmu_page *sp) -{ - return sp->role.cr0_wp && sp->role.smap_andnot_wp; -} - -/* @sp->gfn should be write-protected at the call site */ -static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || - vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -1804,31 +1895,6 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } -static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - kvm_unlink_unsync_page(vcpu->kvm, sp); - return __kvm_sync_page(vcpu, sp, invalid_list); -} - -/* @gfn should be write-protected at the call site */ -static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *s; - bool ret = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (!s->unsync) - continue; - - WARN_ON(s->role.level != PG_LEVEL_4K); - ret |= kvm_sync_page(vcpu, s, invalid_list); - } - - return ret; -} - struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; @@ -1923,6 +1989,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } for_each_sp(pages, sp, parents, i) { + kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } @@ -1958,8 +2025,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; - bool need_sync = false; - bool flush = false; int collisions = 0; LIST_HEAD(invalid_list); @@ -1982,20 +2047,39 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, continue; } - if (!need_sync && sp->unsync) - need_sync = true; - - if (sp->role.word != role.word) + if (sp->role.word != role.word) { + /* + * If the guest is creating an upper-level page, zap + * unsync pages for the same gfn. While it's possible + * the guest is using recursive page tables, in all + * likelihood the guest has stopped using the unsync + * page and is installing a completely unrelated page. + * Unsync pages must not be left as is, because the new + * upper-level page will be write-protected. + */ + if (level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); continue; + } if (direct_mmu) goto trace_get_page; if (sp->unsync) { - /* The page is good, but __kvm_sync_page might still end - * up zapping it. If so, break in order to rebuild it. + /* + * The page is good, but is stale. kvm_sync_page does + * get the latest guest state, but (unlike mmu_unsync_children) + * it doesn't write-protect the page or mark it synchronized! + * This way the validity of the mapping is ensured, but the + * overhead of write protection is not incurred until the + * guest invalidates the TLB mapping. This allows multiple + * SPs for a single gfn to be unsync. + * + * If the sync fails, the page is zapped. If so, break + * in order to rebuild it. */ - if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + if (!kvm_sync_page(vcpu, sp, &invalid_list)) break; WARN_ON(!list_empty(&invalid_list)); @@ -2020,22 +2104,14 @@ trace_get_page: sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (!direct) { - /* - * we should do write protection before syncing pages - * otherwise the content of the synced shadow page may - * be inconsistent with guest page table. - */ account_shadowed(vcpu->kvm, sp); if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - - if (level > PG_LEVEL_4K && need_sync) - flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } trace_kvm_mmu_get_page(sp, true); - - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); out: + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; @@ -2448,17 +2524,33 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +/* + * Attempt to unsync any shadow pages that can be reached by the specified gfn, + * KVM is creating a writable mapping for said gfn. Returns 0 if all pages + * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must + * be write-protected. + */ +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + /* + * Force write-protection if the page is being tracked. Note, the page + * track machinery is used to write-protect upper-level shadow pages, + * i.e. this guards the role.level == 4K assertion below! + */ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) - return true; + return -EPERM; + /* + * The page is not write-tracked, mark existing shadow pages unsync + * unless KVM is synchronizing an unsync SP (can_unsync = false). In + * that case, KVM must complete emulation of the guest TLB flush before + * allowing shadow pages to become unsync (writable by the guest). + */ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return true; + return -EPERM; if (sp->unsync) continue; @@ -2489,8 +2581,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, * 2.2 Guest issues TLB flush. * That causes a VM Exit. * - * 2.3 kvm_mmu_sync_pages() reads sp->unsync. - * Since it is false, so it just returns. + * 2.3 Walking of unsync pages sees sp->unsync is + * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, @@ -2506,7 +2598,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, */ smp_wmb(); - return false; + return 0; } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2827,9 +2919,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -3180,6 +3269,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); +void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned long roots_to_free = 0; + hpa_t root_hpa; + int i; + + /* + * This should not be called while L2 is active, L2 can't invalidate + * _only_ its own roots, e.g. INVVPID unconditionally exits. + */ + WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + root_hpa = mmu->prev_roots[i].hpa; + if (!VALID_PAGE(root_hpa)) + continue; + + if (!to_shadow_page(root_hpa) || + to_shadow_page(root_hpa)->role.guest_mode) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); +} +EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); + + static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; @@ -3280,6 +3396,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } + r = alloc_all_memslots_rmaps(vcpu->kvm); + if (r) + return r; + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) @@ -3423,8 +3543,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) * flush strictly after those changes are made. We only need to * ensure that the other CPU sets these flags before any actual * changes to the page tables are made. The comments in - * mmu_need_write_protect() describe what could go wrong if this - * requirement isn't satisfied. + * mmu_try_to_unsync_pages() describe what could go wrong if + * this requirement isn't satisfied. */ if (!smp_load_acquire(&sp->unsync) && !smp_load_acquire(&sp->unsync_children)) @@ -3474,19 +3594,6 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } -static bool -__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) -{ - int bit7 = (pte >> 7) & 1; - - return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; -} - -static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) -{ - return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); -} - static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* @@ -3540,12 +3647,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { - *sptep = 0ull; - return reserved; - } - - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3569,13 +3671,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the - * reserved bit and EPT's invalid memtype/XWR checks to avoid - * adding a Jcc in the loop. - */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | - __is_rsvd_bits_set(rsvd_check, sptes[level], level); + reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", @@ -3583,7 +3679,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, - rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); + get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; @@ -3717,6 +3813,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3729,7 +3826,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + if (!is_tdp_mmu_fault) { r = fast_page_fault(vcpu, gpa, error_code); if (r != RET_PF_INVALID) return r; @@ -3751,7 +3848,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = RET_PF_RETRY; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_lock(&vcpu->kvm->mmu_lock); else write_lock(&vcpu->kvm->mmu_lock); @@ -3762,7 +3859,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) goto out_unlock; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, pfn, prefault); else @@ -3770,7 +3867,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); @@ -3840,17 +3937,13 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, max_level, true); } -static void nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = true; - context->nx = false; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -3913,8 +4006,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, } static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role, - bool skip_tlb_flush, bool skip_mmu_sync) + union kvm_mmu_page_role new_role) { if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); @@ -3929,10 +4021,10 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_mmu_sync || force_flush_and_sync_on_reuse) + if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush || force_flush_and_sync_on_reuse) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When @@ -3951,11 +4043,9 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, to_shadow_page(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, - bool skip_mmu_sync) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { - __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush, skip_mmu_sync); + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -3981,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, - unsigned level, unsigned gpte) -{ - /* - * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. - * If it is clear, there are no large pages at this level, so clear - * PT_PAGE_SIZE_MASK in gpte if that is the case. - */ - gpte &= level - mmu->last_nonleaf_level; - - /* - * PG_LEVEL_4K always terminates. The RHS has bit 7 set - * iff level <= PG_LEVEL_4K, which for our purpose means - * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PG_LEVEL_4K - 1; - - return gpte & PT_PAGE_SIZE_MASK; -} - #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" @@ -4015,8 +4085,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, #undef PTTYPE static void -__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct rsvd_bits_validate *rsvd_check, +__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { @@ -4105,14 +4174,29 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } +static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) +{ + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + */ + return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); +} + static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, - context->root_level, context->nx, - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), + context->root_level, is_efer_nx(context), + guest_can_use_gbpages(vcpu), + is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4165,24 +4249,32 @@ static inline u64 reserved_hpa_bits(void) * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ -void -reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - bool uses_nx = context->nx || - context->mmu_role.base.smep_andnot_wp; + /* + * KVM uses NX when TDP is disabled to handle a variety of scenarios, + * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and + * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. + * The iTLB multi-hit workaround can be toggled at any time, so assume + * NX can be used by any non-nested shadow MMU to avoid having to reset + * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. + */ + bool uses_nx = is_efer_nx(context) || !tdp_enabled; + + /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ + bool is_amd = true; + /* KVM doesn't use 2-level page tables for the shadow MMU. */ + bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; - /* - * Passing "true" to the last argument is okay; it adds a check - * on bit 8 of the SPTEs which KVM doesn't use anyway. - */ + WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); + shadow_zero_check = &context->shadow_zero_check; - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, uses_nx, - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), true); + guest_can_use_gbpages(vcpu), is_pse, is_amd); if (!shadow_me_mask) return; @@ -4193,7 +4285,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } } -EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); static inline bool boot_cpu_is_amd(void) { @@ -4215,11 +4306,10 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), - true, true); + false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false); @@ -4255,8 +4345,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, (7 & (access) ? 128 : 0)) -static void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; @@ -4264,9 +4353,10 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); - bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0; - bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0; - bool cr0_wp = is_write_protection(vcpu); + bool cr4_smep = is_cr4_smep(mmu); + bool cr4_smap = is_cr4_smap(mmu); + bool cr0_wp = is_cr0_wp(mmu); + bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; @@ -4292,7 +4382,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ - if (!mmu->nx) + if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ @@ -4351,24 +4441,17 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. */ -static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept) +static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; - if (ept) { - mmu->pkru_mask = 0; - return; - } - - /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + if (!is_cr4_pke(mmu)) { mmu->pkru_mask = 0; return; } - wp = is_write_protection(vcpu); + wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; @@ -4402,81 +4485,51 @@ static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } } -static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) { - unsigned root_level = mmu->root_level; + if (!is_cr0_pg(mmu)) + return; - mmu->last_nonleaf_level = root_level; - if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) - mmu->last_nonleaf_level++; + reset_rsvds_bits_mask(vcpu, mmu); + update_permission_bitmask(mmu, false); + update_pkru_bitmask(mmu); } -static void paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void paging64_init_context(struct kvm_mmu *context) { - context->nx = is_nx(vcpu); - context->root_level = level; - - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - - MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->shadow_root_level = level; context->direct_map = false; } -static void paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - int root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - - paging64_init_context_common(vcpu, context, root_level); -} - -static void paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_mmu *context) { - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = false; } -static void paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); -} - -static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) +static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_extended_role ext = {0}; - ext.cr0_pg = !!is_paging(vcpu); - ext.cr4_pae = !!is_pae(vcpu); - ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - ext.cr4_pse = !!is_pse(vcpu); - ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); - ext.maxphyaddr = cpuid_maxphyaddr(vcpu); + if (____is_cr0_pg(regs)) { + ext.cr0_pg = 1; + ext.cr4_pae = ____is_cr4_pae(regs); + ext.cr4_smep = ____is_cr4_smep(regs); + ext.cr4_smap = ____is_cr4_smap(regs); + ext.cr4_pse = ____is_cr4_pse(regs); + + /* PKEY and LA57 are active iff long mode is active. */ + ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + } ext.valid = 1; @@ -4484,20 +4537,23 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) } static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = {0}; role.base.access = ACC_ALL; - role.base.nxe = !!is_nx(vcpu); - role.base.cr0_wp = is_write_protection(vcpu); + if (____is_cr0_pg(regs)) { + role.base.efer_nx = ____is_efer_nx(regs); + role.base.cr0_wp = ____is_cr0_wp(regs); + } role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); if (base_only) return role; - role.ext = kvm_calc_mmu_role_ext(vcpu); + role.ext = kvm_calc_mmu_role_ext(vcpu, regs); return role; } @@ -4512,9 +4568,10 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4527,8 +4584,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role new_role = - kvm_calc_tdp_mmu_root_page_role(vcpu, false); + kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -4542,60 +4600,44 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; + context->root_level = role_regs_to_root_level(®s); - if (!is_paging(vcpu)) { - context->nx = false; + if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; - context->root_level = 0; - } else if (is_long_mode(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - reset_rsvds_bits_mask(vcpu, context); - context->gva_to_gpa = paging64_gva_to_gpa; - } else if (is_pae(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); + else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; - } else { - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); + else context->gva_to_gpa = paging32_gva_to_gpa; - } - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); + reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } static union kvm_mmu_role -kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); - role.base.smep_andnot_wp = role.ext.cr4_smep && - !is_write_protection(vcpu); - role.base.smap_andnot_wp = role.ext.cr4_smap && - !is_write_protection(vcpu); - role.base.gpte_is_8_bytes = !!is_pae(vcpu); + role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); + role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); + role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); return role; } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, base_only); + kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); - role.base.direct = !is_paging(vcpu); + role.base.direct = !____is_cr0_pg(regs); - if (!is_long_mode(vcpu)) + if (!____is_efer_lma(regs)) role.base.level = PT32E_ROOT_LEVEL; - else if (is_la57_mode(vcpu)) + else if (____is_cr4_la57(regs)) role.base.level = PT64_ROOT_5LEVEL; else role.base.level = PT64_ROOT_4LEVEL; @@ -4604,37 +4646,44 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - u32 cr0, u32 cr4, u32 efer, + struct kvm_mmu_role_regs *regs, union kvm_mmu_role new_role) { - if (!(cr0 & X86_CR0_PG)) - nonpaging_init_context(vcpu, context); - else if (efer & EFER_LMA) - paging64_init_context(vcpu, context); - else if (cr4 & X86_CR4_PAE) - paging32E_init_context(vcpu, context); - else - paging32_init_context(vcpu, context); + if (new_role.as_u64 == context->mmu_role.as_u64) + return; context->mmu_role.as_u64 = new_role.as_u64; + + if (!is_cr0_pg(context)) + nonpaging_init_context(context); + else if (is_cr4_pae(context)) + paging64_init_context(context); + else + paging32_init_context(context); + context->root_level = role_regs_to_root_level(regs); + + reset_guest_paging_metadata(vcpu, context); + context->shadow_root_level = new_role.base.level; + reset_shadow_zero_bits_mask(vcpu, context); } -static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); + kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); - if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, regs, new_role); } static union kvm_mmu_role -kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, false); + kvm_calc_shadow_root_page_role_common(vcpu, regs, false); role.base.direct = false; role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4642,23 +4691,22 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) return role; } -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, - gpa_t nested_cr3) +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; - union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + struct kvm_mmu_role_regs regs = { + .cr0 = cr0, + .cr4 = cr4, + .efer = efer, + }; + union kvm_mmu_role new_role; - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); - if (new_role.as_u64 != context->mmu_role.as_u64) { - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); - /* - * Override the level set by the common init helper, nested TDP - * always uses the host's TDP configuration. - */ - context->shadow_root_level = new_role.base.level; - } + shadow_mmu_init_context(vcpu, context, ®s, new_role); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4678,15 +4726,10 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.guest_mode = true; role.base.access = ACC_ALL; - /* - * WP=1 and NOT_WP=1 is an impossible combination, use WP and the - * SMAP variation to denote shadow EPT entries. - */ - role.base.cr0_wp = true; - role.base.smap_andnot_wp = true; - - role.ext = kvm_calc_mmu_role_ext(vcpu); + /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ + role.ext.word = 0; role.ext.execonly = execonly; + role.ext.valid = 1; return role; } @@ -4700,14 +4743,15 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true); + __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); if (new_role.as_u64 == context->mmu_role.as_u64) return; + context->mmu_role.as_u64 = new_role.as_u64; + context->shadow_root_level = level; - context->nx = true; context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; @@ -4715,11 +4759,9 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->root_level = level; context->direct_map = false; - context->mmu_role.as_u64 = new_role.as_u64; - update_permission_bitmask(vcpu, context, true); - update_pkru_bitmask(vcpu, context, true); - update_last_nonleaf_level(vcpu, context); + update_permission_bitmask(context, true); + update_pkru_bitmask(context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4728,20 +4770,21 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - kvm_init_shadow_mmu(vcpu, - kvm_read_cr0_bits(vcpu, X86_CR0_PG), - kvm_read_cr4_bits(vcpu, X86_CR4_PAE), - vcpu->arch.efer); + kvm_init_shadow_mmu(vcpu, ®s); context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } -static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) +static union kvm_mmu_role +kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) { - union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false); + union kvm_mmu_role role; + + role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); /* * Nested MMUs are used only for walking L2's gva->gpa, they never have @@ -4749,23 +4792,14 @@ static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) * to "true" to try to detect bogus usage of the nested MMU. */ role.base.direct = true; - - if (!is_paging(vcpu)) - role.base.level = 0; - else if (is_long_mode(vcpu)) - role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - else if (is_pae(vcpu)) - role.base.level = PT32E_ROOT_LEVEL; - else - role.base.level = PT32_ROOT_LEVEL; - + role.base.level = role_regs_to_root_level(regs); return role; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { - union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu); + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); + union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_role.as_u64 == g_context->mmu_role.as_u64) @@ -4775,6 +4809,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->get_guest_pgd = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; + g_context->root_level = new_role.base.level; /* * L2 page tables are never shadowed, so there is no need to sync @@ -4790,44 +4825,20 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ - if (!is_paging(vcpu)) { - g_context->nx = false; - g_context->root_level = 0; + if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; - } else if (is_long_mode(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else if (is_pae(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else { - g_context->nx = false; - g_context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); + else g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - } - update_permission_bitmask(vcpu, g_context, false); - update_pkru_bitmask(vcpu, g_context, false); - update_last_nonleaf_level(vcpu, g_context); + reset_guest_paging_metadata(vcpu, g_context); } -void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) +void kvm_init_mmu(struct kvm_vcpu *vcpu) { - if (reset_roots) { - uint i; - - vcpu->arch.mmu->root_hpa = INVALID_PAGE; - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - } - if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4840,20 +4851,53 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) { + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role role; if (tdp_enabled) - role = kvm_calc_tdp_mmu_root_page_role(vcpu, true); + role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); else - role = kvm_calc_shadow_mmu_root_page_role(vcpu, true); + role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); return role.base; } +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + /* + * Invalidate all MMU roles to force them to reinitialize as CPUID + * information is factored into reserved bit calculations. + */ + vcpu->arch.root_mmu.mmu_role.ext.valid = 0; + vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; + vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; + kvm_mmu_reset_context(vcpu); + + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise + * sweep the problem under the rug. + * + * KVM's horrific CPUID ABI makes the problem all but impossible to + * solve, as correctly handling multiple vCPU models (with respect to + * paging and physical address properties) in a single VM would require + * tracking all relevant CPUID information in kvm_mmu_page_role. That + * is very undesirable as it would double the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments), and in practice + * no sane VMM mucks with the core vCPU model on the fly. + */ + if (vcpu->arch.last_vmentry_cpu != -1) { + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); + } +} + void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - kvm_init_mmu(vcpu, true); + kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -5491,7 +5535,13 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - kvm_mmu_init_tdp_mmu(kvm); + if (!kvm_mmu_init_tdp_mmu(kvm)) + /* + * No smp_load/store wrappers needed here as we are in + * VM init and there cannot be any memslots / other threads + * accessing this struct kvm yet. + */ + kvm->arch.memslots_have_rmaps = true; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5514,29 +5564,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; - write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, + end - 1, true, flush); + } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - - write_unlock(&kvm->mmu_lock); - if (is_tdp_mmu_enabled(kvm)) { flush = false; @@ -5563,12 +5613,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, int start_level) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5636,18 +5689,17 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; - bool flush; - - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + bool flush = false; - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - read_lock(&kvm->mmu_lock); flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); if (flush) @@ -5674,11 +5726,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5981,6 +6036,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel static void kvm_recover_nx_lpages(struct kvm *kvm) { + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -5992,7 +6048,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) break; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d64ccb417c60..35567293c1fd 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -116,14 +116,19 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -bool is_nx_huge_page_enabled(void); -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); + struct kvm_memory_slot *slot, u64 gfn, + int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); @@ -158,8 +163,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp); -bool is_nx_huge_page_enabled(void); - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index e798489b56b5..efbad33a0645 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -40,7 +40,7 @@ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ + role.efer_nx ? "" : "!", \ role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 34bb0ec69bd8..91a9f7e0fd91 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -100,7 +100,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, kvm_mmu_gfn_disallow_lpage(slot, gfn); if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs(kvm); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 823a5919f9fa..490a028ddabe 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) return pkeys; } +static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, + unsigned int level, unsigned int gpte) +{ + /* + * For EPT and PAE paging (both variants), bit 7 is either reserved at + * all level or indicates a huge page (ignoring CR3/EPTP). In either + * case, bit 7 being set terminates the walk. + */ +#if PTTYPE == 32 + /* + * 32-bit paging requires special handling because bit 7 is ignored if + * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is + * greater than the last level for which bit 7 is the PAGE_SIZE bit. + * + * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 + * is not reserved and does not indicate a large page at this level, + * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse); +#endif + /* + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PG_LEVEL_4K - 1; + + return gpte & PT_PAGE_SIZE_MASK; +} /* * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ @@ -421,7 +450,7 @@ retry_walk: /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); - } while (!is_last_gpte(mmu, walker->level, pte)); + } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; @@ -471,8 +500,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) + if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; @@ -767,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, bool self_changed = false; if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_write_protection(vcpu) && !user_fault))) + (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) return false; for (level = walker->level; level <= walker->max_level; level++) { @@ -865,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * we will cache the incorrect access into mmio spte. */ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_write_protection(vcpu) && !user_fault && - !is_noslot_pfn(pfn)) { + !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -876,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * then we should prevent the kernel from executing it * if SMEP is enabled. */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } @@ -1031,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; int set_spte_ret = 0; - /* direct kvm_mmu_page can not be unsync. */ - BUG_ON(sp->role.direct); + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || + (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 66d43cec0c31..3e97cdb13eb7 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -103,13 +103,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* - * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set - * if PAE paging may be employed (shadow paging or any 32-bit KVM). - */ - WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && - (spte & SPTE_TDP_AD_MASK)); - - /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, * ACC_USER_MASK and shadow_user_mask are used to represent @@ -154,13 +147,19 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. + * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ if (!can_unsync && is_writable_pte(old_spte)) goto out; - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + /* + * Unsync shadow pages that are reachable by the new, writable + * SPTE. Write-protect the SPTE if the page can't be unsync'd, + * e.g. it's write-tracked (upper-level SPs) or has one or more + * shadow pages and unsync'ing pages is not allowed. + */ + if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; @@ -176,7 +175,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: - WARN_ON(is_mmio_spte(spte)); + WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), + "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, + get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + *new_spte = spte; return ret; } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index bca0ba11cccf..7a5ce9314107 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -293,6 +293,38 @@ static inline bool is_dirty_spte(u64 spte) return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } +static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, + int level) +{ + int bit7 = (pte >> 7) & 1; + + return rsvd_check->rsvd_bits_mask[bit7][level-1]; +} + +static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, + u64 pte, int level) +{ + return pte & get_rsvd_bits(rsvd_check, pte, level); +} + +static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, + u64 pte) +{ + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); +} + +static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + u64 spte, int level) +{ + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the reserved + * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc + * (this is extremely unlikely to be short-circuited as true). + */ + return __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, level); +} + static inline bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & shadow_host_writable_mask) && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d90eb364d73a..0853370bd811 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return; + return false; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; @@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + + return true; } static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, @@ -335,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, for (i = 0; i < PT64_ENT_PER_PAGE; i++) { sptep = rcu_dereference(pt) + i; - gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); if (shared) { /* @@ -377,12 +379,12 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, WRITE_ONCE(*sptep, REMOVED_SPTE); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level - 1, + old_child_spte, REMOVED_SPTE, level, shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -912,7 +914,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, kvm_pfn_t pfn, bool prefault) { u64 new_spte; - int ret = 0; + int ret = RET_PF_FIXED; int make_spte_ret = 0; if (unlikely(is_noslot_pfn(pfn))) @@ -949,7 +951,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, rcu_dereference(iter->sptep)); } - if (!prefault) + /* + * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be + * consistent with legacy MMU behavior. + */ + if (ret != RET_PF_SPURIOUS) vcpu->stat.pf_fixed++; return ret; @@ -977,11 +983,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level; int req_level; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -1024,7 +1025,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); child_pt = sp->spt; new_spte = make_nonleaf_spte(child_pt, @@ -1462,15 +1463,22 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t gfn) + gfn_t gfn, int min_level) { struct tdp_iter iter; u64 new_spte; bool spte_set = false; + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, gfn, gfn + 1) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + if (!is_writable_pte(iter.old_spte)) break; @@ -1492,14 +1500,15 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn) + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) - spte_set |= write_protect_gfn(kvm, root, gfn); + spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 5fdf63090451..1cae4485b3bc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); /* * Don't allow yielding, as the caller may have a flush pending. Note, @@ -74,37 +74,40 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn); + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level); int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); #ifdef CONFIG_X86_64 -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -#else -static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } -static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -#endif -static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { struct kvm_mmu_page *sp; + hpa_t hpa = mmu->root_hpa; - if (!is_tdp_mmu_enabled(kvm)) - return false; if (WARN_ON(!VALID_PAGE(hpa))) return false; + /* + * A NULL shadow page is legal when shadowing a non-paging guest with + * PAE paging, as the MMU will be direct with root_hpa pointing at the + * pae_root page, not a shadow page. + */ sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return is_tdp_mmu_page(sp) && sp->root_count; + return sp && is_tdp_mmu_page(sp) && sp->root_count; } +#else +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } +#endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ |