Diffstat (limited to 'arch/powerpc/kvm/book3s_64_mmu_radix.c')
-rw-r--r--	arch/powerpc/kvm/book3s_64_mmu_radix.c	292
1 file changed, 190 insertions, 102 deletions
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index da857c8ba6e4..4a1abb9f7c05 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -11,16 +11,19 @@
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/debugfs.h>
+#include <linux/pgtable.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include "book3s_hv.h"
 #include <asm/page.h>
 #include <asm/mmu.h>
-#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/pte-walk.h>
 #include <asm/ultravisor.h>
 #include <asm/kvm_book3s_uvmem.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/firmware.h>
 
 /*
  * Supported radix tree geometry.
@@ -33,14 +36,21 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 					      gva_t eaddr, void *to, void *from,
 					      unsigned long n)
 {
-	int uninitialized_var(old_pid), old_lpid;
+	int old_pid, old_lpid;
 	unsigned long quadrant, ret = n;
 	bool is_load = !!to;
 
+	if (kvmhv_is_nestedv2())
+		return H_UNSUPPORTED;
+
 	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
 	if (kvmhv_on_pseries())
 		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
-					  __pa(to), __pa(from), n);
+					  (to != NULL) ? __pa(to): 0,
+					  (from != NULL) ? __pa(from): 0, n);
+
+	if (eaddr & (0xFFFUL << 52))
+		return ret;
 
 	quadrant = 1;
 	if (!pid)
@@ -52,6 +62,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 
 	preempt_disable();
 
+	asm volatile("hwsync" ::: "memory");
+	isync();
 	/* switch the lpid first to avoid running host with unallocated pid */
 	old_lpid = mfspr(SPRN_LPID);
 	if (old_lpid != lpid)
@@ -65,11 +77,13 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 
 	pagefault_disable();
 	if (is_load)
-		ret = raw_copy_from_user(to, from, n);
+		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
 	else
-		ret = raw_copy_to_user(to, from, n);
+		ret = __copy_to_user_inatomic((void __user *)to, from, n);
 	pagefault_enable();
 
+	asm volatile("hwsync" ::: "memory");
+	isync();
 	/* switch the pid first to avoid running host with unallocated pid */
 	if (quadrant == 1 && pid != old_pid)
 		mtspr(SPRN_PID, old_pid);
@@ -81,13 +95,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
 
 static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
 					  void *to, void *from, unsigned long n)
 {
 	int lpid = vcpu->kvm->arch.lpid;
-	int pid = vcpu->arch.pid;
+	int pid;
 
 	/* This would cause a data segment intr so don't allow the access */
 	if (eaddr & (0x3FFUL << 52))
@@ -100,6 +113,8 @@ static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
 	/* If accessing quadrant 3 then pid is expected to be 0 */
 	if (((eaddr >> 62) & 0x3) == 0x3)
 		pid = 0;
+	else
+		pid = kvmppc_get_pid(vcpu);
 
 	eaddr &= ~(0xFFFUL << 52);
 
@@ -117,14 +132,12 @@ long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
 
 long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
 			       void *from, unsigned long n)
 {
 	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
 }
-EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
 
 int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 			       struct kvmppc_pte *gpte, u64 root,
@@ -162,7 +175,10 @@ int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 			return -EINVAL;
 		/* Read the entry from guest memory */
 		addr = base + (index * sizeof(rpte));
+
+		kvm_vcpu_srcu_read_lock(vcpu);
 		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+		kvm_vcpu_srcu_read_unlock(vcpu);
 		if (ret) {
 			if (pte_ret_p)
 				*pte_ret_p = addr;
@@ -238,7 +254,9 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 	/* Read the table to find the root of the radix tree */
 	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+	kvm_vcpu_srcu_read_lock(vcpu);
 	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+	kvm_vcpu_srcu_read_unlock(vcpu);
 	if (ret)
 		return ret;
 
@@ -258,7 +276,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	/* Work out effective PID */
 	switch (eaddr >> 62) {
 	case 0:
-		pid = vcpu->arch.pid;
+		pid = kvmppc_get_pid(vcpu);
 		break;
 	case 3:
 		pid = 0;
@@ -282,9 +300,9 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	} else {
 		if (!(pte & _PAGE_PRIVILEGED)) {
 			/* Check AMR/IAMR to see if strict mode is in force */
-			if (vcpu->arch.amr & (1ul << 62))
+			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
 				gpte->may_read = 0;
-			if (vcpu->arch.amr & (1ul << 63))
+			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
 				gpte->may_write = 0;
 			if (vcpu->arch.iamr & (1ul << 62))
 				gpte->may_execute = 0;
@@ -295,7 +313,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 }
 
 void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-			     unsigned int pshift, unsigned int lpid)
+			     unsigned int pshift, u64 lpid)
 {
 	unsigned long psize = PAGE_SIZE;
 	int psi;
@@ -315,14 +333,24 @@ void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 	}
 
 	psi = shift_to_mmu_psize(pshift);
-	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
-	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
-				lpid, rb);
+
+	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
+		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+					lpid, rb);
+	} else {
+		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+					    H_RPTI_TYPE_NESTED |
+					    H_RPTI_TYPE_TLB,
+					    psize_to_rpti_pgsize(psi),
+					    addr, addr + psize);
+	}
+
 	if (rc)
 		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 }
 
-static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
 {
 	long rc;
 
@@ -331,8 +359,14 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 		return;
 	}
 
-	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
-				lpid, TLBIEL_INVAL_SET_LPID);
+	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+					lpid, TLBIEL_INVAL_SET_LPID);
+	else
+		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+					    H_RPTI_TYPE_NESTED |
+					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
+					    0, -1UL);
 	if (rc)
 		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 }
@@ -344,7 +378,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 	return __radix_pte_update(ptep, clr, set);
 }
 
-void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
 			     pte_t *ptep, pte_t pte)
 {
 	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
@@ -355,7 +389,13 @@ static struct kmem_cache *kvm_pmd_cache;
 
 static pte_t *kvmppc_pte_alloc(void)
 {
-	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+	pte_t *pte;
+
+	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+	/* pmd_populate() will only reference _pa(pte). */
+	kmemleak_ignore(pte);
+
+	return pte;
 }
 
 static void kvmppc_pte_free(pte_t *ptep)
@@ -365,7 +405,13 @@ static void kvmppc_pte_free(pte_t *ptep)
 
 static pmd_t *kvmppc_pmd_alloc(void)
 {
-	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
+	pmd_t *pmd;
+
+	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
+	/* pud_populate() will only reference _pa(pmd). */
+	kmemleak_ignore(pmd);
+
+	return pmd;
 }
 
 static void kvmppc_pmd_free(pmd_t *pmdp)
@@ -377,7 +423,7 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
 
 void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
 		      unsigned int shift, const struct kvm_memory_slot *memslot,
-		      unsigned int lpid)
+		      u64 lpid)
 {
 	unsigned long old;
 
@@ -419,15 +465,19 @@ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
  * Callers are responsible for flushing the PWC.
  *
  * When page tables are being unmapped/freed as part of page fault path
- * (full == false), ptes are not expected. There is code to unmap them
- * and emit a warning if encountered, but there may already be data
- * corruption due to the unexpected mappings.
+ * (full == false), valid ptes are generally not expected; however, there
+ * is one situation where they arise, which is when dirty page logging is
+ * turned off for a memslot while the VM is running. The new memslot
+ * becomes visible to page faults before the memslot commit function
+ * gets to flush the memslot, which can lead to a 2MB page mapping being
+ * installed for a guest physical address where there are already 64kB
+ * (or 4kB) mappings (of sub-pages of the same 2MB page).
  */
 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
-				  unsigned int lpid)
+				  u64 lpid)
 {
 	if (full) {
-		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
+		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
 	} else {
 		pte_t *p = pte;
 		unsigned long it;
@@ -435,7 +485,6 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 			if (pte_val(*p) == 0)
 				continue;
-			WARN_ON_ONCE(1);
 			kvmppc_unmap_pte(kvm, p, pte_pfn(*p) << PAGE_SHIFT,
 					 PAGE_SHIFT, NULL, lpid);
 		}
@@ -446,7 +495,7 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
-				  unsigned int lpid)
+				  u64 lpid)
 {
 	unsigned long im;
 	pmd_t *p = pmd;
 
@@ -466,7 +515,7 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
 		} else {
 			pte_t *pte;
 
-			pte = pte_offset_map(p, 0);
+			pte = pte_offset_kernel(p, 0);
 			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 			pmd_clear(p);
 		}
@@ -475,7 +524,7 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
-				  unsigned int lpid)
+				  u64 lpid)
 {
 	unsigned long iu;
 	pud_t *p = pud;
 
@@ -496,18 +545,19 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 	pud_free(kvm->mm, pud);
 }
 
-void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
 {
 	unsigned long ig;
 
 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+		p4d_t *p4d = p4d_offset(pgd, 0);
 		pud_t *pud;
 
-		if (!pgd_present(*pgd))
+		if (!p4d_present(*p4d))
 			continue;
-		pud = pud_offset(pgd, 0);
+		pud = pud_offset(p4d, 0);
 		kvmppc_unmap_free_pud(kvm, pud, lpid);
-		pgd_clear(pgd);
+		p4d_clear(p4d);
 	}
 }
@@ -522,7 +572,7 @@ void kvmppc_free_radix(struct kvm *kvm)
 }
 
 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
-					unsigned long gpa, unsigned int lpid)
+					unsigned long gpa, u64 lpid)
 {
 	pte_t *pte = pte_offset_kernel(pmd, 0);
 
@@ -538,7 +588,7 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 }
 
 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
-					unsigned long gpa, unsigned int lpid)
+					unsigned long gpa, u64 lpid)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 
@@ -564,10 +614,11 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 
 int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		      unsigned long gpa, unsigned int level,
-		      unsigned long mmu_seq, unsigned int lpid,
+		      unsigned long mmu_seq, u64 lpid,
 		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud, *new_pud = NULL;
 	pmd_t *pmd, *new_pmd = NULL;
 	pte_t *ptep, *new_ptep = NULL;
@@ -575,9 +626,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 	pgd = pgtable + pgd_index(gpa);
+	p4d = p4d_offset(pgd, gpa);
+
 	pud = NULL;
-	if (pgd_present(*pgd))
-		pud = pud_offset(pgd, gpa);
+	if (p4d_present(*p4d))
+		pud = pud_offset(p4d, gpa);
 	else
 		new_pud = pud_alloc_one(kvm->mm, gpa);
 
@@ -593,18 +646,18 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 	/* Check if we might have been invalidated; let the guest retry if so */
 	spin_lock(&kvm->mmu_lock);
 	ret = -EAGAIN;
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	/* Now traverse again under the lock and change the tree */
 	ret = -ENOMEM;
-	if (pgd_none(*pgd)) {
+	if (p4d_none(*p4d)) {
 		if (!new_pud)
 			goto out_unlock;
-		pgd_populate(kvm->mm, pgd, new_pud);
+		p4d_populate(kvm->mm, p4d, new_pud);
 		new_pud = NULL;
 	}
-	pud = pud_offset(pgd, gpa);
+	pud = pud_offset(p4d, gpa);
 	if (pud_is_leaf(*pud)) {
 		unsigned long hgpa = gpa & PUD_MASK;
 
@@ -737,8 +790,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 	return ret;
 }
 
-bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
-			     unsigned long gpa, unsigned int lpid)
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
+			     unsigned long gpa, u64 lpid)
 {
 	unsigned long pgflags;
 	unsigned int shift;
@@ -752,12 +805,12 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
 	pgflags = _PAGE_ACCESSED;
 	if (writing)
 		pgflags |= _PAGE_DIRTY;
-	/*
-	 * We are walking the secondary (partition-scoped) page table here.
-	 * We can do this without disabling irq because the Linux MM
-	 * subsystem doesn't do THP splits and collapses on this tree.
-	 */
-	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+
+	if (nested)
+		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+	else
+		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+
 	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
 		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
 		return true;
@@ -783,7 +836,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	bool large_enable;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/*
@@ -793,14 +846,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	 * is that the page is writable.
 	 */
 	hva = gfn_to_hva_memslot(memslot, gfn);
-	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
 		upgrade_write = true;
 	} else {
 		unsigned long pfn;
 
 		/* Call KVM generic code to do the slow-path check */
-		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, upgrade_p);
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+					   writing, upgrade_p, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
@@ -815,20 +868,21 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	 * Read the PTE from the process' radix tree and use that
 	 * so we get the shift and attribute bits.
 	 */
-	local_irq_disable();
-	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+	spin_lock(&kvm->mmu_lock);
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+	pte = __pte(0);
+	if (ptep)
+		pte = READ_ONCE(*ptep);
+	spin_unlock(&kvm->mmu_lock);
 	/*
 	 * If the PTE disappeared temporarily due to a THP
 	 * collapse, just return and let the guest try again.
 	 */
-	if (!ptep) {
-		local_irq_enable();
+	if (!pte_present(pte)) {
 		if (page)
 			put_page(page);
 		return RESUME_GUEST;
 	}
-	pte = *ptep;
-	local_irq_enable();
 
 	/* If we're logging dirty pages, always map single pages */
 	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
@@ -888,7 +942,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 				   unsigned long ea, unsigned long dsisr)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -906,7 +960,9 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (dsisr & DSISR_BADACCESS) {
 		/* Reflect to the guest as DSI */
 		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
-		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+		kvmppc_core_queue_data_storage(vcpu,
+				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+				ea, dsisr);
 		return RESUME_GUEST;
 	}
 
@@ -931,17 +987,20 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * Bad address in guest page table tree, or other
 			 * unusual error - reflect it to the guest as DSI.
 			 */
-			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, dsisr);
 			return RESUME_GUEST;
 		}
-		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
 	}
 
 	if (memslot->flags & KVM_MEM_READONLY) {
 		if (writing) {
 			/* give the guest a DSI */
-			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
-						       DSISR_PROTFAULT);
+			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
 			return RESUME_GUEST;
 		}
 		kvm_ro = true;
@@ -950,8 +1009,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	/* Failed to set the reference/change bits */
 	if (dsisr & DSISR_SET_RC) {
 		spin_lock(&kvm->mmu_lock);
-		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
-					    writing, gpa, kvm->arch.lpid))
+		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
+					    gpa, kvm->arch.lpid))
 			dsisr &= ~DSISR_SET_RC;
 		spin_unlock(&kvm->mmu_lock);
@@ -970,8 +1029,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		    unsigned long gfn)
+void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		     unsigned long gfn)
 {
 	pte_t *ptep;
 	unsigned long gpa = gfn << PAGE_SHIFT;
@@ -979,30 +1038,29 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
 		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
-		return 0;
+		return;
 	}
 
-	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 	if (ptep && pte_present(*ptep))
 		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 				 kvm->arch.lpid);
-	return 0;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		  unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		   unsigned long gfn)
 {
 	pte_t *ptep;
 	unsigned long gpa = gfn << PAGE_SHIFT;
 	unsigned int shift;
-	int ref = 0;
+	bool ref = false;
 	unsigned long old, *rmapp;
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 		return ref;
 
-	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
 		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
 					      gpa, shift);
@@ -1012,26 +1070,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
 						       old & PTE_RPN_MASK,
 						       1UL << shift);
-		ref = 1;
+		ref = true;
 	}
 	return ref;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		       unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn)
+
 {
 	pte_t *ptep;
 	unsigned long gpa = gfn << PAGE_SHIFT;
 	unsigned int shift;
-	int ref = 0;
+	bool ref = false;
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 		return ref;
 
-	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 	if (ptep && pte_present(*ptep) && pte_young(*ptep))
-		ref = 1;
+		ref = true;
 	return ref;
 }
 
@@ -1041,7 +1100,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 {
 	unsigned long gfn = memslot->base_gfn + pagenum;
 	unsigned long gpa = gfn << PAGE_SHIFT;
-	pte_t *ptep;
+	pte_t *ptep, pte;
 	unsigned int shift;
 	int ret = 0;
 	unsigned long old, *rmapp;
@@ -1049,12 +1108,35 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 		return ret;
 
-	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
-		ret = 1;
-		if (shift)
-			ret = 1 << (shift - PAGE_SHIFT);
+	/*
+	 * For performance reasons we don't hold kvm->mmu_lock while walking the
+	 * partition scoped table.
+	 */
+	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
+	if (!ptep)
+		return 0;
+
+	pte = READ_ONCE(*ptep);
+	if (pte_present(pte) && pte_dirty(pte)) {
 		spin_lock(&kvm->mmu_lock);
+		/*
+		 * Recheck the pte again
+		 */
+		if (pte_val(pte) != pte_val(*ptep)) {
+			/*
+			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
+			 * only find PAGE_SIZE pte entries here. We can continue
+			 * to use the pte addr returned by above page table
+			 * walk.
+			 */
+			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
+				spin_unlock(&kvm->mmu_lock);
+				return 0;
+			}
+		}
+
+		ret = 1;
+		VM_BUG_ON(shift);
 		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
 					      gpa, shift);
 		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
@@ -1102,7 +1184,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
 	unsigned int shift;
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
-		kvmppc_uvmem_drop_pages(memslot, kvm);
+		kvmppc_uvmem_drop_pages(memslot, kvm, true);
 
 	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 		return;
@@ -1110,12 +1192,17 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
 
 	gpa = memslot->base_gfn << PAGE_SHIFT;
 	spin_lock(&kvm->mmu_lock);
 	for (n = memslot->npages; n; --n) {
-		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 		if (ptep && pte_present(*ptep))
 			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 					 kvm->arch.lpid);
 		gpa += PAGE_SIZE;
 	}
+	/*
+	 * Increase the mmu notifier sequence number to prevent any page
+	 * fault that read the memslot earlier from writing a PTE.
+	 */
+	kvm->mmu_invalidate_seq++;
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -1221,7 +1308,8 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 	unsigned long gpa;
 	pgd_t *pgt;
 	struct kvm_nested_guest *nested;
-	pgd_t pgd, *pgdp;
+	pgd_t *pgdp;
+	p4d_t p4d, *p4dp;
 	pud_t pud, *pudp;
 	pmd_t pmd, *pmdp;
 	pte_t *ptep;
@@ -1294,13 +1382,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 		}
 
 		pgdp = pgt + pgd_index(gpa);
-		pgd = READ_ONCE(*pgdp);
-		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
-			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+		p4dp = p4d_offset(pgdp, gpa);
+		p4d = READ_ONCE(*p4dp);
+		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
+			gpa = (gpa & P4D_MASK) + P4D_SIZE;
 			continue;
 		}
 
-		pudp = pud_offset(&pgd, gpa);
+		pudp = pud_offset(&p4d, gpa);
 		pud = READ_ONCE(*pudp);
 		if (!(pud_val(pud) & _PAGE_PRESENT)) {
 			gpa = (gpa & PUD_MASK) + PUD_SIZE;
@@ -1378,9 +1467,8 @@ static const struct file_operations debugfs_radix_fops = {
 
 void kvmhv_radix_debugfs_init(struct kvm *kvm)
 {
-	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
-						     kvm->arch.debugfs_dir, kvm,
-						     &debugfs_radix_fops);
+	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
+			    &debugfs_radix_fops);
 }
 
 int kvmppc_radix_init(void)
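
The kvm_vcpu_srcu_read_lock()/kvm_vcpu_srcu_read_unlock() pairs added around both kvm_read_guest() calls reflect the rule that KVM's memslot array is SRCU-protected, so any guest-memory access must sit inside an SRCU read-side critical section. A self-contained userspace analogy of that bracket, with a pthread rwlock standing in for SRCU (purely illustrative; all names here are invented, and real SRCU readers are far cheaper than a rwlock):

#include <pthread.h>
#include <string.h>
#include <stdio.h>

static pthread_rwlock_t memslots_lock = PTHREAD_RWLOCK_INITIALIZER;
static char guest_mem[64] = "guest page table entry";

/* analogue of kvm_read_guest(): must run with the read side held */
static int read_guest(void *to, size_t off, size_t n)
{
	memcpy(to, guest_mem + off, n);
	return 0;
}

static int walk_one_entry(void)
{
	char rpte[8];
	int ret;

	pthread_rwlock_rdlock(&memslots_lock);   /* kvm_vcpu_srcu_read_lock() */
	ret = read_guest(rpte, 0, sizeof(rpte));
	pthread_rwlock_unlock(&memslots_lock);   /* kvm_vcpu_srcu_read_unlock() */
	return ret;
}

int main(void)
{
	printf("read %s\n", walk_one_entry() ? "failed" : "ok");
	return 0;
}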
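kvmppc_radix_tlbie_page() and kvmppc_radix_flush_pwc() now choose between the old H_TLB_INVALIDATE hcall and the newer H_RPT_INVALIDATE interface based on firmware_has_feature(FW_FEATURE_RPT_INVALIDATE): probe a capability once at boot, dispatch per call. A minimal sketch of that shape in plain C (the hcall names are from the diff; everything else is invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* pretend capability flag, discovered from firmware at boot */
static bool fw_rpt_invalidate;

static long hcall_tlb_invalidate(unsigned long lpid, unsigned long addr)
{
	printf("H_TLB_INVALIDATE lpid=%lu addr=%#lx\n", lpid, addr);
	return 0;
}

static long hcall_rpt_invalidate(unsigned long lpid, unsigned long start,
				 unsigned long end)
{
	printf("H_RPT_INVALIDATE lpid=%lu range=%#lx-%#lx\n", lpid, start, end);
	return 0;
}

/* mirrors the diff: one call site, two hypervisor interfaces */
static void tlbie_page(unsigned long lpid, unsigned long addr,
		       unsigned long psize)
{
	long rc;

	if (!fw_rpt_invalidate)
		rc = hcall_tlb_invalidate(lpid, addr);
	else
		rc = hcall_rpt_invalidate(lpid, addr, addr + psize);
	if (rc)
		fprintf(stderr, "TLB page invalidation failed, rc=%ld\n", rc);
}

int main(void)
{
	tlbie_page(7, 0x1000, 0x1000);   /* legacy per-page path */
	fw_rpt_invalidate = true;
	tlbie_page(7, 0x1000, 0x1000);   /* range-based path */
	return 0;
}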
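Many hunks are the mechanical fallout of the kernel-wide move to folded p4d page tables: every walk that used to step pgd -> pud gains a p4d_offset() step, and pgd_present()/pgd_populate()/pgd_clear() become their p4d_* counterparts. When the extra level is folded, p4d_offset(pgd, addr) just returns its argument, so the added step costs nothing. A toy walk showing the shape of the descent (array indices standing in for the real offset macros; not kernel code):

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 4   /* toy fan-out per level; real radix levels are larger */

typedef void *ent_t;   /* each table is just an array of next-level pointers */

static ent_t *walk(ent_t *pgd, unsigned idx[4])
{
	ent_t *p4d = pgd;                /* folded level: p4d_offset() is a no-op */
	ent_t *pud = p4d[idx[0]];        /* pud_offset() */
	if (!pud)
		return NULL;             /* the *_present() checks in the diff */
	ent_t *pmd = pud[idx[1]];        /* pmd_offset() */
	if (!pmd)
		return NULL;
	ent_t *pte = pmd[idx[2]];        /* pte_offset_kernel() */
	if (!pte)
		return NULL;
	return &pte[idx[3]];             /* pointer to the leaf entry */
}

int main(void)
{
	ent_t *pgd = calloc(ENTRIES, sizeof(ent_t));
	ent_t *pud = calloc(ENTRIES, sizeof(ent_t));
	ent_t *pmd = calloc(ENTRIES, sizeof(ent_t));
	ent_t *pte = calloc(ENTRIES, sizeof(ent_t));

	pgd[1] = pud;                    /* analogous to p4d_populate() etc. */
	pud[2] = pmd;
	pmd[3] = pte;
	pte[0] = (void *)0xabcd;         /* toy leaf value */

	unsigned idx[4] = { 1, 2, 3, 0 };
	ent_t *leaf = walk(pgd, idx);
	printf("leaf = %p\n", leaf ? *leaf : NULL);
	return 0;
}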
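The mmu_notifier_seq -> mmu_invalidate_seq rename is part of the same protocol visible in kvmppc_book3s_instantiate_page() and kvmppc_create_pte(): the fault path samples the sequence number (then smp_rmb()), does its slow lookups outside the lock, and calls mmu_invalidate_retry() under kvm->mmu_lock before installing anything; the hand-rolled kvm->mmu_invalidate_seq++ in kvmppc_radix_flush_memslot() forces exactly that retry. A runnable userspace analogy of the protocol (invalidate_retry/handle_fault are invented names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong invalidate_seq;      /* analogue of kvm->mmu_invalidate_seq */
static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of mmu_invalidate_retry(): true if an invalidation ran meanwhile */
static bool invalidate_retry(unsigned long seq)
{
	return atomic_load(&invalidate_seq) != seq;
}

/* invalidation side: tear down mappings and bump the counter under the lock */
static void invalidate_range(void)
{
	pthread_mutex_lock(&mmu_lock);
	/* ... unmap pages ... */
	atomic_fetch_add(&invalidate_seq, 1);
	pthread_mutex_unlock(&mmu_lock);
}

/* fault side: sample, look up outside the lock, recheck before installing */
static int handle_fault(void)
{
	unsigned long seq = atomic_load(&invalidate_seq);
	/* smp_rmb() in the kernel orders this read before the PTE lookup */

	/* ... slow path: find the host PTE without holding mmu_lock ... */

	pthread_mutex_lock(&mmu_lock);
	if (invalidate_retry(seq)) {
		pthread_mutex_unlock(&mmu_lock);
		return -1;               /* -EAGAIN: caller retries the fault */
	}
	/* ... install the PTE; no invalidation can have raced us ... */
	pthread_mutex_unlock(&mmu_lock);
	return 0;
}

int main(void)
{
	invalidate_range();
	printf("fault: %s\n", handle_fault() ? "retry" : "mapped");
	return 0;
}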
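Finally, kvm_radix_test_clear_dirty() now walks the partition-scoped table without kvm->mmu_lock and only takes the lock once it has seen a present, dirty PTE, rechecking the entry under the lock before clearing _PAGE_DIRTY; the recheck is what makes the lockless fast path safe. The same double-check pattern as a self-contained sketch (illustrative only, one word standing in for a PTE):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define PTE_DIRTY 0x1UL

static atomic_ulong pte;                 /* stand-in for one radix PTE */
static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;

/* returns 1 and clears the dirty bit if the page was dirty, else 0 */
static int test_clear_dirty(void)
{
	/* lockless snapshot, as with find_kvm_secondary_pte_unlocked() */
	unsigned long old = atomic_load(&pte);

	if (!(old & PTE_DIRTY))
		return 0;                /* fast path: nothing to do */

	pthread_mutex_lock(&mmu_lock);
	/* recheck: the entry may have changed since the lockless read */
	if (!(atomic_load(&pte) & PTE_DIRTY)) {
		pthread_mutex_unlock(&mmu_lock);
		return 0;
	}
	atomic_fetch_and(&pte, ~PTE_DIRTY);  /* clear, then (in KVM) flush TLB */
	pthread_mutex_unlock(&mmu_lock);
	return 1;
}

int main(void)
{
	atomic_store(&pte, PTE_DIRTY);
	printf("first scan:  %d\n", test_clear_dirty());  /* 1 */
	printf("second scan: %d\n", test_clear_dirty());  /* 0 */
	return 0;
}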