diff options
Diffstat (limited to 'arch/x86/kvm/mmu/paging_tmpl.h')
| -rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 761 |
1 files changed, 322 insertions, 439 deletions
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index a6d484ea110b..901cd2bd40b8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,61 +16,57 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG cmpxchg #else - #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 2 #endif #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG cmpxchg + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 - #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) - #define CMPXCHG cmpxchg64 + #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -90,12 +86,21 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -143,49 +148,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); } -static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t __user *ptep_user, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - int npages; - pt_element_t ret; - pt_element_t *table; - struct page *page; - - npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); - if (likely(npages == 1)) { - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); - } else { - struct vm_area_struct *vma; - unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; - unsigned long pfn; - unsigned long paddr; - - mmap_read_lock(current->mm); - vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); - if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - paddr = pfn << PAGE_SHIFT; - table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); - if (!table) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - ret = CMPXCHG(&table[index], orig_pte, new_pte); - memunmap(table); - mmap_read_unlock(current->mm); - } - - return (ret != orig_pte); -} - static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) @@ -193,7 +155,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, if (!FNAME(is_present_gpte)(gpte)) goto no_present; - /* if accessed bit is not supported prefetch non accessed gpte */ + /* Prefetch only accessed entries (unless A/D bits are disabled). */ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; @@ -235,7 +197,7 @@ static inline unsigned FNAME(gpte_access)(u64 gpte) static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, - int write_fault) + gpa_t addr, int write_fault) { unsigned level, index; pt_element_t pte, orig_pte; @@ -260,7 +222,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu)) + if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -284,7 +246,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (unlikely(!walker->pte_writable[level - 1])) continue; - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault); if (ret) return ret; @@ -305,20 +267,49 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) return pkeys; } +static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, + unsigned int level, unsigned int gpte) +{ + /* + * For EPT and PAE paging (both variants), bit 7 is either reserved at + * all level or indicates a huge page (ignoring CR3/EPTP). In either + * case, bit 7 being set terminates the walk. + */ +#if PTTYPE == 32 + /* + * 32-bit paging requires special handling because bit 7 is ignored if + * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is + * greater than the last level for which bit 7 is the PAGE_SIZE bit. + * + * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 + * is not reserved and does not indicate a large page at this level, + * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse); +#endif + /* + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PG_LEVEL_4K - 1; + + return gpte & PT_PAGE_SIZE_MASK; +} /* * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t addr, u32 access) + gpa_t addr, u64 access) { int ret; pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); + pt_element_t __user *ptep_user; gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; - unsigned nested_access; + u64 nested_access; gpa_t pte_gpa; bool have_ad; int offset; @@ -332,8 +323,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - walker->level = mmu->root_level; - pte = mmu->get_guest_pgd(vcpu); + walker->level = mmu->cpu_role.base.level; + pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 @@ -347,7 +338,6 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging @@ -357,10 +347,21 @@ retry_walk: nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; pte_access = ~0; + + /* + * Queue a page fault for injection if this assertion fails, as callers + * assume that walker.fault contains sane info on a walk failure. I.e. + * avoid making the situation worse by inducing even worse badness + * between when the assertion fails and when KVM kicks the vCPU out to + * userspace (because the VM is bugged). + */ + if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) + goto error; + ++walker->level; do { - gfn_t real_gfn; + struct kvm_memory_slot *slot; unsigned long host_addr; pt_access = pte_access; @@ -375,9 +376,8 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - nested_access, - &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), + nested_access, &walker->fault); /* * FIXME: This can happen if emulation (for of an INS/OUTS @@ -389,18 +389,20 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gfn == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; - real_gfn = gpa_to_gfn(real_gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); + if (!kvm_is_visible_memslot(slot)) + goto error; - host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, + host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__get_user(pte, ptep_user))) + if (unlikely(get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; @@ -421,13 +423,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; - } while (!is_last_gpte(mmu, walker->level, pte)); + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); + } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -436,11 +440,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -457,21 +463,19 @@ retry_walk: (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, + addr, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) goto retry_walk; } - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); return 1; error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) + if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; @@ -480,7 +484,7 @@ error: #if PTTYPE == PTTYPE_EPT /* - * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * Use PFERR_RSVD_MASK in error_code to tell if EPT * misconfiguration requires to be injected. The detection is * done by is_rsvd_bits_set() above. * @@ -493,78 +497,52 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= 0x180; + walker->fault.exit_qualification = 0; + if (write_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; - vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; + + /* + * Note, pte_access holds the raw RWX bits from the EPTE, not + * ACC_*_MASK flags! + */ + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gpa_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u64 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); } -#if PTTYPE != PTTYPE_EPT -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} -#endif - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, pt_element_t gpte, bool no_dirty_log) + u64 *spte, pt_element_t gpte) { unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) - return false; - - /* - * we call mmu_set_spte() with host_writable = true because - * pte_prefetch_gfn_to_pfn always gets a writable pfn. - */ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, - true, true); - - kvm_release_pfn_clean(pfn); - return true; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte = *(const pt_element_t *)pte; - FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); + return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -598,15 +576,22 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *spte; int i; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) + return; + if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -616,7 +601,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (is_shadow_present_pte(*spte)) continue; - if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } @@ -626,21 +611,19 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) +static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + struct guest_walker *gw) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; - gfn_t base_gfn = gw->gfn; + unsigned int direct_access, access; + int top_level, ret; + gfn_t base_gfn = fault->gfn; + WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; - top_level = vcpu->arch.mmu->root_level; + top_level = vcpu->arch.mmu->cpu_role.base.level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* @@ -650,115 +633,119 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - goto out_gpte_changed; + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; + + /* + * Load a new root and retry the faulting instruction in the extremely + * unlikely scenario that the guest root gfn became visible between + * loading a dummy root and handling the resulting page fault, e.g. if + * userspace create a memslot in the interim. + */ + if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { + kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); + return RET_PF_RETRY; + } - for (shadow_walk_init(&it, vcpu, addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { + for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); + if (it.level == gw->level) + break; - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access); - } + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; + + /* + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + fault->write_fault_to_shadow_pgtable = true; } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + /* + * Adjust the hugepage size _after_ resolving indirect shadow pages. + * KVM doesn't support mapping hugepages into the guest for gfns that + * are being shadowed by KVM, i.e. allocating a new shadow page may + * affect the allowed hugepage size. + */ + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { - clear_sp_write_flooding_count(it.sptep); - /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + base_gfn = gfn_round_for_level(fault->gfn, it.level); + if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } - ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, base_gfn, pfn, prefault, map_writable); - FNAME(pte_prefetch)(vcpu, gw, it.sptep); - ++vcpu->stat.pf_fixed; - return ret; - -out_gpte_changed: - return RET_PF_RETRY; -} - - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_write_protection(vcpu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, + base_gfn, fault->pfn, fault); + if (ret == RET_PF_SPURIOUS) + return ret; - return self_changed; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); + return ret; } /* @@ -775,80 +762,57 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; - unsigned long mmu_seq; - bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); - int max_level; - - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; + WARN_ON_ONCE(fault->is_tdp); /* + * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ - error_code &= ~PFERR_RSVD_MASK; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, fault->addr, + fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - pgprintk("%s: guest page fault\n", __func__); - if (!prefault) + if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { - shadow_page_table_clear_flood(vcpu, addr); - return RET_PF_EMULATE; - } - - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + fault->gfn = walker.gfn; + fault->max_level = walker.level; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); - if (lpage_disallowed || is_self_change_mapping) - max_level = PG_LEVEL_4K; - else - max_level = walker.level; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); + if (page_fault_handle_page_track(vcpu, fault)) { + shadow_page_table_clear_flood(vcpu, fault->addr); + return RET_PF_WRITE_PROTECTED; + } - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) - return RET_PF_RETRY; + r = mmu_topup_memory_caches(vcpu, true); + if (r) + return r; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); + if (r != RET_PF_CONTINUE) return r; +#if PTTYPE != PTTYPE_EPT /* - * Do not change pte_access if the pfn is a mmio page, otherwise - * we will cache the incorrect access into mmio spte. + * Treat the guest PTE protections as writable, supervisor-only if this + * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore + * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO, + * otherwise KVM will cache incorrect access information in the SPTE. */ - if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_write_protection(vcpu) && !user_fault && - !is_noslot_pfn(pfn)) { + if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && + !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -858,25 +822,25 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * then we should prevent the kernel from executing it * if SMEP is enabled. */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } +#endif r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + write_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); - kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_mmu_finish_page_fault(vcpu, fault, r); + write_unlock(&vcpu->kvm->mmu_lock); return r; } @@ -884,201 +848,121 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. - */ - mmu_topup_memory_caches(vcpu); - - if (!VALID_PAGE(root_hpa)) { - WARN_ON(1); - return; - } - - spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = page_header(__pa(sptep)); - if (is_last_spte(*sptep, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, - sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(update_pte)(vcpu, sp, sptep, &gpte); - } - - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) - break; - } - spin_unlock(&vcpu->kvm->mmu_lock); -} - /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t addr, u64 access, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr)(&walker, vcpu, addr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= addr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -#if PTTYPE != PTTYPE_EPT -/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. */ - WARN_ON_ONCE(vaddr >> 32); + WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); #endif - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; return gpa; } -#endif /* - * Using the cached information from sp->gfns is safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because SPTEs are protected by mmu_notifiers and memslot generations, so + * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are + * nuked first. * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * Returns + * < 0: failed to sync spte + * 0: the spte is synced and no tlb flushing is required + * > 0: the spte is synced and tlb flushing is required */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; - int set_spte_ret = 0; + u64 *sptep, spte; + struct kvm_memory_slot *slot; + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; - /* direct kvm_mmu_page can not be unsync. */ - BUG_ON(sp->role.direct); + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || + !sp->shadowed_translation)) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -1; - if (!sp->spt[i]) - continue; + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) + return 1; - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - return 0; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - /* - * Update spte before increasing tlbs_dirty to make - * sure no tlb flush is lost after spte is zapped; see - * the comments in kvm_flush_remote_tlbs(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; - continue; - } - - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, - &nr_present)) - continue; - - if (gfn != sp->gfns[i]) { - drop_spte(vcpu->kvm, &sp->spt[i]); - /* - * The same as above where we are doing - * prefetch_invalid_gpte(). - */ - smp_wmb(); - vcpu->kvm->tlbs_dirty++; - continue; - } - - nr_present++; + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) + return 0; - set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PG_LEVEL_4K, - gfn, spte_to_pfn(sp->spt[i]), - true, false, host_writable); + /* + * Drop the SPTE if the new protections result in no effective + * "present" bit or if the gfn is changing. The former case + * only affects EPT with execute-only support with pte_access==0; + * all other paging modes will create a read-only SPTE if + * pte_access is zero. + */ + if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || + gfn != kvm_mmu_page_get_gfn(sp, i)) { + drop_spte(vcpu->kvm, &sp->spt[i]); + return 1; } + /* + * Do nothing if the permissions are unchanged. The existing SPTE is + * still, and prefetch_invalid_gpte() has verified that the A/D bits + * are set in the "new" gPTE, i.e. there is no danger of missing an A/D + * update due to A/D bits being set in the SPTE but not the gPTE. + */ + if (kvm_mmu_page_get_access(sp, i) == pte_access) + return 0; + + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) - kvm_flush_remote_tlbs(vcpu->kvm); + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, true, + host_writable, &spte); - return nr_present; + /* + * There is no need to mark the pfn dirty, as the new protections must + * be a subset of the old protections, i.e. synchronizing a SPTE cannot + * change the SPTE from read-only to writable. + */ + return mmu_spte_update(sptep, spte); } #undef pt_element_t @@ -1092,7 +976,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT |
