summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu/paging_tmpl.h
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu/paging_tmpl.h')
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h173
1 files changed, 95 insertions, 78 deletions
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0662e0278e70..901cd2bd40b8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -62,7 +62,7 @@
#endif
/* Common logic, but per-type values. These also need to be undefined. */
-#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK)
#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
@@ -338,7 +338,6 @@ retry_walk:
}
#endif
walker->max_level = walker->level;
- ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
@@ -348,9 +347,21 @@ retry_walk:
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
++walker->level;
do {
+ struct kvm_memory_slot *slot;
unsigned long host_addr;
pt_access = pte_access;
@@ -381,13 +392,17 @@ retry_walk:
if (unlikely(real_gpa == INVALID_GPA))
return 0;
- host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__get_user(pte, ptep_user)))
+ if (unlikely(get_user(pte, ptep_user)))
goto error;
walker->ptep_user[walker->level - 1] = ptep_user;
@@ -456,9 +471,6 @@ retry_walk:
goto retry_walk;
}
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access,
- walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -485,21 +497,20 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED);
+ walker->fault.exit_qualification = 0;
+
if (write_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
- EPT_VIOLATION_RWX_SHIFT;
+ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
@@ -521,31 +532,17 @@ static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte)
{
- struct kvm_memory_slot *slot;
unsigned pte_access;
gfn_t gfn;
- kvm_pfn_t pfn;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
- if (!slot)
- return false;
-
- pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
- if (is_error_pfn(pfn))
- return false;
-
- mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
- kvm_release_pfn_clean(pfn);
- return true;
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
}
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -636,10 +633,21 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* really care if it changes underneath us after this point).
*/
if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
- goto out_gpte_changed;
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+ * userspace create a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
+ return RET_PF_RETRY;
+ }
for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
@@ -653,34 +661,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
false, access);
- if (sp != ERR_PTR(-EEXIST)) {
- /*
- * We must synchronize the pagetable before linking it
- * because the guest doesn't need to flush tlb when
- * the gpte is changed from non-present to present.
- * Otherwise, the guest may use the wrong mapping.
- *
- * For PG_LEVEL_4K, kvm_mmu_get_page() has already
- * synchronized it transiently via kvm_sync_page().
- *
- * For higher level pagetable, we synchronize it via
- * the slower mmu_sync_children(). If it needs to
- * break, some progress has been made; return
- * RET_PF_RETRY and retry on the next #PF.
- * KVM_REQ_MMU_SYNC is not necessary but it
- * expedites the process.
- */
- if (sp->unsync_children &&
- mmu_sync_children(vcpu, sp, false))
- return RET_PF_RETRY;
- }
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot be unsync themselves
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
/*
- * Verify that the gpte in the page we've just write
- * protected is still there.
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
*/
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
@@ -734,9 +746,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return ret;
-
-out_gpte_changed:
- return RET_PF_RETRY;
}
/*
@@ -758,7 +767,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct guest_walker walker;
int r;
- pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
/*
@@ -773,7 +781,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
@@ -786,20 +793,23 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (page_fault_handle_page_track(vcpu, fault)) {
shadow_page_table_clear_flood(vcpu, fault->addr);
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
}
r = mmu_topup_memory_caches(vcpu, true);
if (r)
return r;
- r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
+ r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
+#if PTTYPE != PTTYPE_EPT
/*
- * Do not change pte_access if the pfn is a mmio page, otherwise
- * we will cache the incorrect access into mmio spte.
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -815,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
+#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
@@ -828,8 +839,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = FNAME(fetch)(vcpu, fault, &walker);
out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -837,7 +848,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
int offset = 0;
- WARN_ON(sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
if (PTTYPE == 32)
offset = sp->role.quadrant << SPTE_LEVEL_BITS;
@@ -872,9 +883,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/*
* Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
- * safe because:
- * - The spte has a reference to the struct page, so the pfn for a given gfn
- * can't change unless all sptes pointing to it are nuked first.
+ * safe because SPTEs are protected by mmu_notifiers and memslot generations, so
+ * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are
+ * nuked first.
*
* Returns
* < 0: failed to sync spte
@@ -892,7 +903,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
@@ -914,13 +926,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
@@ -942,9 +954,14 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
host_writable = spte & shadow_host_writable_mask;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
+ spte_to_pfn(spte), spte, true, true,
host_writable, &spte);
+ /*
+ * There is no need to mark the pfn dirty, as the new protections must
+ * be a subset of the old protections, i.e. synchronizing a SPTE cannot
+ * change the SPTE from read-only to writable.
+ */
return mmu_spte_update(sptep, spte);
}