diff options
Diffstat (limited to 'arch/powerpc/mm/book3s64')
21 files changed, 3592 insertions, 1418 deletions
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index fd393b8be14f..cad2abc1730f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -2,22 +2,34 @@ ccflags-y := $(NO_MINIMAL_TOC) +obj-y += mmu_context.o pgtable.o trace.o +ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - -obj-y += hash_pgtable.o hash_utils.o slb.o \ - mmu_context.o pgtable.o hash_tlb.o -obj-$(CONFIG_PPC_NATIVE) += hash_native.o -obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o +obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +endif + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o +obj-$(CONFIG_PPC_PKEY) += pkeys.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n + +# Parts of these can run in real mode and therefore are +# not safe with the current outline KASAN implementation +KASAN_SANITIZE_mmu_context.o := n +KASAN_SANITIZE_pgtable.o := n +KASAN_SANITIZE_radix_pgtable.o := n +KASAN_SANITIZE_radix_tlb.o := n +KASAN_SANITIZE_slb.o := n +KASAN_SANITIZE_pkeys.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c index 22e787123cdf..02acbfd05b46 100644 --- a/arch/powerpc/mm/book3s64/hash_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, int subpg_prot) @@ -54,7 +56,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, * PP bits. _PAGE_USER is already PP bit 0x2, so we only * need to add in 0x1 if it's a read-only user page */ - rflags = htab_convert_pte_flags(new_pte); + rflags = htab_convert_pte_flags(new_pte, flags); rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); if (cpu_has_feature(CPU_FTR_NOEXECUTE) && @@ -118,6 +120,9 @@ repeat: } new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c index 7084ce2951e6..954af420f358 100644 --- a/arch/powerpc/mm/book3s64/hash_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + /* * Return true, if the entry has a slot value which * the software considers as invalid. @@ -72,7 +74,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, * Handle the subpage protection bits */ subpg_pte = new_pte & ~subpg_prot; - rflags = htab_convert_pte_flags(subpg_pte); + rflags = htab_convert_pte_flags(subpg_pte, flags); if (cpu_has_feature(CPU_FTR_NOEXECUTE) && !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { @@ -216,6 +218,9 @@ repeat: new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); new_pte |= H_PAGE_HASHPTE; + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -260,7 +265,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - rflags = htab_convert_pte_flags(new_pte); + rflags = htab_convert_pte_flags(new_pte, flags); rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); if (cpu_has_feature(CPU_FTR_NOEXECUTE) && @@ -327,7 +332,12 @@ repeat: new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c index 440823797de7..c0fabe6c5a12 100644 --- a/arch/powerpc/mm/book3s64/hash_hugepage.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -57,7 +57,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) return 0; - rflags = htab_convert_pte_flags(new_pmd); + rflags = htab_convert_pte_flags(new_pmd, flags); #if 0 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index d2d8237ea9d5..430d1d935a7c 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -14,11 +14,11 @@ #include <linux/processor.h> #include <linux/threads.h> #include <linux/smp.h> +#include <linux/pgtable.h> #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/trace.h> #include <asm/tlb.h> #include <asm/cputable.h> @@ -43,102 +43,28 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static void acquire_hpte_lock(void) { - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) - : "memory"); + lock_map_acquire(&hpte_lock_map); } - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +static void release_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - asm volatile("ptesync": : :"memory"); + lock_map_release(&hpte_lock_map); } - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +#else +static void acquire_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - /* - * Now invalidate the process table cache. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - asm volatile("ptesync": : :"memory"); - - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } -void hash__tlbiel_all(unsigned int action) +static void release_hpte_lock(void) { - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); } +#endif static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) @@ -260,7 +186,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) va |= ssize << 8; sllp = get_sllp_encoding(apsize); va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -279,7 +205,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) */ va |= (vpn & 0xfe); va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -303,7 +229,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); if (use_local) { __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else { __tlbie(vpn, psize, apsize, ssize); fixup_tlbie_vpn(vpn, psize, apsize, ssize); @@ -317,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -331,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -340,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -360,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -383,19 +316,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } static long native_hpte_remove(unsigned long hpte_group) { + unsigned long hpte_v, flags; struct hash_pte *hptep; int i; int slot_offset; - unsigned long hpte_v; + + local_irq_save(flags); DBG_LOW(" remove(group=%lx)\n", hpte_group); @@ -420,12 +358,16 @@ static long native_hpte_remove(unsigned long hpte_group) slot_offset &= 0x7; } - if (i == HPTES_PER_GROUP) - return -1; + if (i == HPTES_PER_GROUP) { + i = -1; + goto out; + } /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - +out: + local_irq_restore(flags); return i; } @@ -436,6 +378,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -479,6 +424,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -542,6 +489,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -560,6 +510,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -573,6 +525,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -590,6 +545,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -614,10 +572,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -677,10 +636,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -780,7 +737,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; @@ -862,8 +819,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } @@ -879,7 +838,7 @@ static void native_flush_hash_range(unsigned long number, int local) __tlbiel(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } - asm volatile("ptesync":::"memory"); + ppc_after_tlbiel_barrier(); } else { int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..988948d69bc1 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -8,16 +8,15 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/mm.h> +#include <linux/stop_machine.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/sections.h> #include <asm/mmu.h> #include <asm/tlb.h> +#include <asm/firmware.h> #include <mm/mmu_decl.h> -#define CREATE_TRACE_POINTS #include <trace/events/thp.h> #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) @@ -148,6 +147,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -155,7 +155,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); @@ -213,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; @@ -236,7 +237,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * to hugepage, we first clear the pmd, then invalidate all * the PTE entries. The assumption here is that any low level * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a + * will wait on mmap_lock. But we could very well be in a * hash_page with local ptep pointer value. Such a hash page * can result in adding new HPTE entries for normal subpages. * That means we could be modifying the page content as we @@ -250,12 +251,12 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and + * result in a major fault which takes mmap_lock and * hence wait for collapse to complete. Without this * the __collapse_huge_page_copy can result in copying * the old content. */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -363,17 +364,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, * hash fault look at them. */ memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); return old_pmd; } @@ -388,7 +378,7 @@ int hash__has_transparent_hugepage(void) if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) return 0; /* - * We need to make sure that we support 16MB hugepage in a segement + * We need to make sure that we support 16MB hugepage in a segment * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE * of 64K. */ @@ -411,10 +401,105 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX + +struct change_memory_parms { + unsigned long start, end, newpp; + unsigned int step, nr_cpus; + atomic_t master_cpu; + atomic_t cpu_counter; +}; + +// We'd rather this was on the stack but it has to be in the RMO +static struct change_memory_parms chmem_parms; + +// And therefore we need a lock to protect it from concurrent use +static DEFINE_MUTEX(chmem_lock); + +static void change_memory_range(unsigned long start, unsigned long end, + unsigned int step, unsigned long newpp) +{ + unsigned long idx; + + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); + + for (idx = start; idx < end; idx += step) + /* Not sure if we can do much with the return value */ + mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, + mmu_kernel_ssize); +} + +static int notrace chmem_secondary_loop(struct change_memory_parms *parms) +{ + unsigned long msr, tmp, flags; + int *p; + + p = &parms->cpu_counter.counter; + + local_irq_save(flags); + hard_irq_disable(); + + asm volatile ( + // Switch to real mode and leave interrupts off + "mfmsr %[msr] ;" + "li %[tmp], %[MSR_IR_DR] ;" + "andc %[tmp], %[msr], %[tmp] ;" + "mtmsrd %[tmp] ;" + + // Tell the master we are in real mode + "1: " + "lwarx %[tmp], 0, %[p] ;" + "addic %[tmp], %[tmp], -1 ;" + "stwcx. %[tmp], 0, %[p] ;" + "bne- 1b ;" + + // Spin until the counter goes to zero + "2: ;" + "lwz %[tmp], 0(%[p]) ;" + "cmpwi %[tmp], 0 ;" + "bne- 2b ;" + + // Switch back to virtual mode + "mtmsrd %[msr] ;" + + : // outputs + [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p) + : // inputs + [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR) + : // clobbers + "cc", "xer" + ); + + local_irq_restore(flags); + + return 0; +} + +static int change_memory_range_fn(void *data) +{ + struct change_memory_parms *parms = data; + + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) + return chmem_secondary_loop(parms); + + // Wait for all but one CPU (this one) to call-in + while (atomic_read(&parms->cpu_counter) > 1) + barrier(); + + change_memory_range(parms->start, parms->end, parms->step, parms->newpp); + + mb(); + + // Signal the other CPUs that we're done + atomic_dec(&parms->cpu_counter); + + return 0; +} + static bool hash__change_memory_range(unsigned long start, unsigned long end, unsigned long newpp) { - unsigned long idx; unsigned int step, shift; shift = mmu_psize_defs[mmu_linear_psize].shift; @@ -426,25 +511,43 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, if (start >= end) return false; - pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", - start, end, newpp, step); + if (firmware_has_feature(FW_FEATURE_LPAR)) { + mutex_lock(&chmem_lock); - for (idx = start; idx < end; idx += step) - /* Not sure if we can do much with the return value */ - mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, - mmu_kernel_ssize); + chmem_parms.start = start; + chmem_parms.end = end; + chmem_parms.step = step; + chmem_parms.newpp = newpp; + atomic_set(&chmem_parms.master_cpu, 0); + + cpus_read_lock(); + + atomic_set(&chmem_parms.cpu_counter, num_online_cpus()); + + // Ensure state is consistent before we call the other CPUs + mb(); + + stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms, + cpu_online_mask); + + cpus_read_unlock(); + mutex_unlock(&chmem_lock); + } else + change_memory_range(start, end, step, newpp); return true; } void hash__mark_rodata_ro(void) { - unsigned long start, end; + unsigned long start, end, pp; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); - WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); + WARN_ON(!hash__change_memory_range(start, end, pp)); } void hash__mark_initmem_nx(void) @@ -454,7 +557,7 @@ void hash__mark_initmem_nx(void) start = (unsigned long)__init_begin; end = (unsigned long)__init_end; - pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); WARN_ON(!hash__change_memory_range(start, end, pp)); } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 4a70d8dd39cd..21fcad97ae80 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -21,7 +21,6 @@ #include <linux/mm.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> @@ -176,7 +175,6 @@ void hash__tlb_flush(struct mmu_gather *tlb) * from the hash table (and the TLB). But keeps * the linux PTEs intact. * - * @mm : mm_struct of the target address space (generally init_mm) * @start : starting address * @end : ending address (not included in the flush) * @@ -189,17 +187,14 @@ void hash__tlb_flush(struct mmu_gather *tlb) * Because of that usage pattern, it is implemented for small size rather * than speed. */ -void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end) +void __flush_hash_table_range(unsigned long start, unsigned long end) { - bool is_thp; int hugepage_shift; unsigned long flags; - start = _ALIGN_DOWN(start, PAGE_SIZE); - end = _ALIGN_UP(end, PAGE_SIZE); + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); - BUG_ON(!mm->pgd); /* * Note: Normally, we should only ever use a batch within a @@ -212,33 +207,27 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; if (ptep == NULL) continue; pte = pte_val(*ptep); - if (is_thp) - trace_hugepage_invalidate(start, pte); if (!(pte & H_PAGE_HASHPTE)) continue; - if (unlikely(is_thp)) - hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); - else - hpte_need_flush(mm, start, ptep, pte, hugepage_shift); + hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; unsigned long flags; - addr = _ALIGN_DOWN(addr, PMD_SIZE); + addr = ALIGN_DOWN(addr, PMD_SIZE); /* * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work @@ -250,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) local_irq_save(flags); arch_enter_lazy_mmu_mode(); start_pte = pte_offset_map(pmd, addr); + if (!start_pte) + goto out; for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } + pte_unmap(start_pte); +out: arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b30435c7d804..0626a25b0d72 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -7,7 +7,7 @@ * * SMP scalability work: * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM - * + * * Module name: htab.c * * Description: @@ -35,17 +35,20 @@ #include <linux/pkeys.h> #include <linux/hugetlb.h> #include <linux/cpu.h> +#include <linux/pgtable.h> +#include <linux/debugfs.h> +#include <linux/random.h> +#include <linux/elf-randomize.h> +#include <linux/of_fdt.h> -#include <asm/debugfs.h> +#include <asm/interrupt.h> #include <asm/processor.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> #include <asm/types.h> #include <linux/uaccess.h> #include <asm/machdep.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/eeh.h> #include <asm/tlb.h> @@ -66,6 +69,9 @@ #include <mm/mmu_decl.h> +#include "internal.h" + + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -95,8 +101,6 @@ */ static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); u8 hpte_page_sizes[1 << LP_BITS]; EXPORT_SYMBOL_GPL(hpte_page_sizes); @@ -109,9 +113,7 @@ int mmu_linear_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif +EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; EXPORT_SYMBOL_GPL(mmu_kernel_ssize); @@ -121,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -#ifdef CONFIG_DEBUG_PAGEALLOC static u8 *linear_map_hash_slots; static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); @@ -170,6 +169,110 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + ppc_after_tlbiel_barrier(); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the partition table cache if this is HV mode. + */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + + /* + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. + */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + ppc_after_tlbiel_barrier(); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + /* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* @@ -183,7 +286,7 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { * - We make sure R is always set and never lost * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping */ -unsigned long htab_convert_pte_flags(unsigned long pteflags) +unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags) { unsigned long rflags = 0; @@ -207,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) else rflags |= 0x3; } + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); } else { if (pteflags & _PAGE_RWX) rflags |= 0x2; + /* + * We should never hit this in normal fault handling because + * a permission check (check_pte_access()) will bubble this + * to higher level linux handler even for PAGE_NONE. + */ + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } @@ -237,7 +347,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) */ rflags |= HPTE_R_M; - rflags |= pte_to_hpte_pkey_bits(pteflags); + rflags |= pte_to_hpte_pkey_bits(pteflags, flags); return rflags; } @@ -252,13 +362,17 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, shift = mmu_psize_defs[psize].shift; step = 1 << shift; - prot = htab_convert_pte_flags(prot); + prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY); DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", vstart, vend, pstart, prot, psize, ssize); - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { + /* Carefully map only the possible range */ + vaddr = ALIGN(vstart, step); + paddr = ALIGN(pstart, step); + vend = ALIGN_DOWN(vend, step); + + for (; vaddr < vend; vaddr += step, paddr += step) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); @@ -298,7 +412,7 @@ repeat: ssize); if (ret == -1) { /* - * Try to to keep bolted entries in primary. + * Try to keep bolted entries in primary. * Remove non bolted entries and try insert again */ ret = mmu_hash_ops.hpte_remove(hpteg); @@ -317,11 +431,9 @@ repeat: break; cond_resched(); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled_or_kfence() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ } return ret < 0 ? ret : 0; } @@ -329,7 +441,7 @@ repeat: int htab_remove_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize) { - unsigned long vaddr; + unsigned long vaddr, time_limit; unsigned int step, shift; int rc; int ret = 0; @@ -340,8 +452,21 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, if (!mmu_hash_ops.hpte_removebolted) return -ENODEV; - for (vaddr = vstart; vaddr < vend; vaddr += step) { + /* Unmap the full range specificied */ + vaddr = ALIGN_DOWN(vstart, step); + time_limit = jiffies + HZ; + + for (;vaddr < vend; vaddr += step) { rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + + /* + * For large number of mappings introduce a cond_resched() + * to prevent softlockup warnings. + */ + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } if (rc == -ENOENT) { ret = -ENOENT; continue; @@ -353,7 +478,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } -static bool disable_1tb_segments = false; +static bool disable_1tb_segments __ro_after_init; static int __init parse_disable_1tb_segments(char *p) { @@ -362,6 +487,40 @@ static int __init parse_disable_1tb_segments(char *p) } early_param("disable_1tb_segments", parse_disable_1tb_segments); +bool stress_hpt_enabled __initdata; + +static int __init parse_stress_hpt(char *p) +{ + stress_hpt_enabled = true; + return 0; +} +early_param("stress_hpt", parse_stress_hpt); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key); + +/* + * per-CPU array allocated if we enable stress_hpt. + */ +#define STRESS_MAX_GROUPS 16 +struct stress_hpt_struct { + unsigned long last_group[STRESS_MAX_GROUPS]; +}; + +static inline int stress_nr_groups(void) +{ + /* + * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries + * to allow practical forward progress. Bare metal returns 1, which + * seems to help uncover more bugs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return STRESS_MAX_GROUPS; + else + return 1; +} + +static struct stress_hpt_struct *stress_hpt_struct; + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -541,7 +700,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, } #endif /* CONFIG_HUGETLB_PAGE */ -static void mmu_psize_set_default_penc(void) +static void __init mmu_psize_set_default_penc(void) { int bpsize, apsize; for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) @@ -551,7 +710,7 @@ static void mmu_psize_set_default_penc(void) #ifdef CONFIG_PPC_64K_PAGES -static bool might_have_hea(void) +static bool __init might_have_hea(void) { /* * The HEA ethernet adapter requires awareness of the @@ -593,7 +752,7 @@ static void __init htab_scan_page_sizes(void) } #ifdef CONFIG_HUGETLB_PAGE - if (!hugetlb_disabled) { + if (!hugetlb_disabled && !early_radix_enabled() ) { /* Reserve 16G huge page memory sections for huge pages */ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); } @@ -622,7 +781,7 @@ static void __init htab_scan_page_sizes(void) * low-order N bits as the encoding for the 2^(12+N) byte page size * (if it exists). */ -static void init_hpte_page_sizes(void) +static void __init init_hpte_page_sizes(void) { long int ap, bp; long int shift, penc; @@ -652,14 +811,22 @@ static void init_hpte_page_sizes(void) static void __init htab_init_page_sizes(void) { + bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled()) { + if (!debug_pagealloc_enabled_or_kfence()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && + (unsigned long)_stext % 0x1000000) { + if (mmu_psize_defs[MMU_PAGE_16M].shift) + pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n"); + aligned = false; + } + + if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned) mmu_linear_psize = MMU_PAGE_16M; else if (mmu_psize_defs[MMU_PAGE_1M].shift) mmu_linear_psize = MMU_PAGE_1M; @@ -776,7 +943,7 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int resize_hpt_for_hotplug(unsigned long new_mem_size) +static int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; @@ -800,7 +967,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size) return 0; } -int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) +int hash__create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot) { int rc; @@ -809,8 +977,10 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid return -1; } + resize_hpt_for_hotplug(memblock_phys_mem_size()); + rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, + pgprot_val(prot), mmu_linear_psize, mmu_kernel_ssize); if (rc < 0) { @@ -825,7 +995,10 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end) { int rc = htab_remove_mapping(start, end, mmu_linear_psize, mmu_kernel_ssize); - WARN_ON(rc < 0); + + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); + return rc; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -844,13 +1017,30 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, pr_info("Partition table %p\n", partition_tb); } +void hpt_clear_stress(void); +static struct timer_list stress_hpt_timer; +static void stress_hpt_timer_fn(struct timer_list *timer) +{ + int next_cpu; + + hpt_clear_stress(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + tlbiel_all(); + + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer_on(&stress_hpt_timer, next_cpu); +} + static void __init htab_initialize(void) { unsigned long table; unsigned long pteg_count; unsigned long prot; - unsigned long base = 0, size = 0; - struct memblock_region *reg; + phys_addr_t base = 0, size = 0, end; + u64 i; DBG(" -> htab_initialize()\n"); @@ -860,10 +1050,28 @@ static void __init htab_initialize(void) printk(KERN_INFO "Using 1TB segments\n"); } + if (stress_slb_enabled) + static_branch_enable(&stress_slb_key); + + if (stress_hpt_enabled) { + unsigned long tmp; + static_branch_enable(&stress_hpt_key); + // Too early to use nr_cpu_ids, so use NR_CPUS + tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, + __alignof__(struct stress_hpt_struct), + 0, MEMBLOCK_ALLOC_ANYWHERE); + memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); + stress_hpt_struct = __va(tmp); + + timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&stress_hpt_timer); + } + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. - */ + */ htab_size_bytes = htab_get_table_size(); pteg_count = htab_size_bytes >> 7; @@ -873,7 +1081,7 @@ static void __init htab_initialize(void) firmware_has_feature(FW_FEATURE_PS3_LV1)) { /* Using a hypervisor which owns the htab */ htab_address = NULL; - _SDR1 = 0; + _SDR1 = 0; #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -926,8 +1134,7 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { + if (debug_pagealloc_enabled_or_kfence()) { linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; linear_map_hash_slots = memblock_alloc_try_nid( linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, @@ -936,12 +1143,11 @@ static void __init htab_initialize(void) panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", __func__, linear_map_hash_count, &ppc64_rma_size); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; + for_each_mem_range(i, &base, &end) { + size = end - base; + base = (unsigned long)__va(base); DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); @@ -1052,7 +1258,7 @@ void __init hash__early_init_mmu(void) ps3_early_mm_init(); else if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE)) hpte_init_native(); if (!mmu_hash_ops.hpte_insert) @@ -1095,6 +1301,11 @@ void hash__early_init_mmu_secondary(void) if (cpu_has_feature(CPU_FTR_ARCH_206) && cpu_has_feature(CPU_FTR_HVMODE)) tlbiel_all(); + +#ifdef CONFIG_PPC_MEM_KEYS + if (mmu_has_feature(MMU_FTR_PKEY)) + mtspr(SPRN_UAMOR, default_uamor); +#endif } #endif /* CONFIG_SMP */ @@ -1103,25 +1314,25 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); + if (!test_bit(PG_dcache_clean, &folio->flags) && + !folio_test_reserved(folio)) { + if (trap == INTERRUPT_INST_STORAGE) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } else pp |= HPTE_R_N; } return pp; } -#ifdef CONFIG_PPC_MM_SLICES static unsigned int get_paca_psize(unsigned long addr) { unsigned char *psizes; @@ -1138,12 +1349,6 @@ static unsigned int get_paca_psize(unsigned long addr) return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; } -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif /* * Demote a segment to using 4k pages. @@ -1200,7 +1405,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) spp >>= 30 - 2 * ((ea >> 12) & 0xf); /* - * 0 -> full premission + * 0 -> full permission * 1 -> Read only * 2 -> no access. * We return the flag that need to be cleared. @@ -1256,7 +1461,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long flags) { bool is_thp; - enum ctx_state prev_state = exception_enter(); pgd_t *pgdir; unsigned long vsid; pte_t *ptep; @@ -1285,12 +1489,14 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, vsid = get_kernel_vsid(ea, mmu_kernel_ssize); psize = mmu_vmalloc_psize; ssize = mmu_kernel_ssize; + flags |= HPTE_USE_KERNEL_KEY; break; case IO_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); psize = mmu_io_psize; ssize = mmu_kernel_ssize; + flags |= HPTE_USE_KERNEL_KEY; break; default: /* @@ -1340,8 +1546,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, goto bail; } - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; + /* + * Add _PAGE_PRESENT to the required access perm. If there are parallel + * updates to the pte that can possibly clear _PAGE_PTE, catch that too. + * + * We can safely use the return pte address in rest of the function + * because we do set H_PAGE_BUSY which prevents further updates to pte + * from generic code. + */ + access |= _PAGE_PRESENT | _PAGE_PTE; /* * Pre-check access permissions (will be re-checked atomically @@ -1449,7 +1662,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, DBG_LOW(" -> rc=%d\n", rc); bail: - exception_exit(prev_state); return rc; } EXPORT_SYMBOL_GPL(hash_page_mm); @@ -1471,16 +1683,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, - unsigned long msr) +DEFINE_INTERRUPT_HANDLER(do_hash_fault) { + unsigned long ea = regs->dar; + unsigned long dsisr = regs->dsisr; unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; - struct mm_struct *mm = current->mm; - unsigned int region_id = get_region_id(ea); + struct mm_struct *mm; + unsigned int region_id; + long err; + if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) { + hash__do_page_fault(regs); + return; + } + + region_id = get_region_id(ea); if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) mm = &init_mm; + else + mm = current->mm; if (dsisr & DSISR_NOHPTE) flags |= HPTE_NOHPTE_UPDATE; @@ -1496,16 +1718,30 @@ int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, * 2) user space access kernel space. */ access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) + if (user_mode(regs) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; - if (trap == 0x400) + if (TRAP(regs) == INTERRUPT_INST_STORAGE) access |= _PAGE_EXEC; - return hash_page_mm(mm, ea, access, trap, flags); + err = hash_page_mm(mm, ea, access, TRAP(regs), flags); + if (unlikely(err < 0)) { + // failed to insert a hash PTE due to an hypervisor error + if (user_mode(regs)) { + if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, ea); + else + _exception(SIGBUS, regs, BUS_ADRERR, ea); + } else { + bad_page_fault(regs, SIGBUS); + } + err = 0; + + } else if (err) { + hash__do_page_fault(regs); + } } -#ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { int psize = get_slice_psize(mm, ea); @@ -1522,23 +1758,15 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) return true; } -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif -static void hash_preload(struct mm_struct *mm, unsigned long ea, +static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) { - int hugepage_shift; unsigned long vsid; pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); + unsigned long flags; BUG_ON(get_region_id(ea) != USER_REGION_ID); @@ -1558,32 +1786,34 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here + * Called with PTL held, hence can be sure the value won't change in + * between. */ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; + return; #endif /* CONFIG_PPC_64K_PAGES */ + /* + * __hash_page_* must run with interrupts off, including PMI interrupts + * off, as it sets the H_PAGE_BUSY bit. + * + * It's otherwise possible for perf interrupts to hit at any time and + * may take a hash fault reading the user stack, which could take a + * hash miss and deadlock on the same H_PAGE_BUSY bit. + * + * Interrupts must also be off for the duration of the + * mm_is_thread_local test and update, to prevent preempt running the + * mm on another CPU (XXX: this may be racy vs kthread_use_mm). + */ + powerpc_local_irq_pmu_save(flags); + /* Is that local to this CPU ? */ if (mm_is_thread_local(mm)) update_flags |= HPTE_LOCAL_UPDATE; @@ -1606,8 +1836,8 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, mm_ctx_user_psize(&mm->context), mm_ctx_user_psize(&mm->context), pte_val(*ptep)); -out_exit: - local_irq_restore(flags); + + powerpc_local_irq_pmu_restore(flags); } /* @@ -1618,7 +1848,7 @@ out_exit: * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* @@ -1628,11 +1858,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) { - prefetch((void *)address); - return; - } - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; @@ -1658,32 +1883,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; } - hash_preload(vma->vm_mm, address, is_exec, trap); -} - -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. - */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; + hash_preload(vma->vm_mm, ptep, address, is_exec, trap); } -#endif /* CONFIG_PPC_MEM_KEYS */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void tm_flush_hash_page(int local) @@ -1726,10 +1927,6 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, return gslot; } -/* - * WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! - */ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, unsigned long flags) { @@ -1824,27 +2021,6 @@ void flush_hash_range(unsigned long number, int local) } } -/* - * low_hash_fault is called when we the low level hash code failed - * to instert a PTE due to an hypervisor error - */ -void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) -{ - enum ctx_state prev_state = exception_enter(); - - if (user_mode(regs)) { -#ifdef CONFIG_PPC_SUBPAGE_PROT - if (rc == -2) - _exception(SIGSEGV, regs, SEGV_ACCERR, address); - else -#endif - _exception(SIGBUS, regs, BUS_ADRERR, address); - } else - bad_page_fault(regs, address, SIGBUS); - - exception_exit(prev_state); -} - long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, int psize, int ssize) @@ -1878,13 +2054,78 @@ repeat: return slot; } -#ifdef CONFIG_DEBUG_PAGEALLOC +void hpt_clear_stress(void) +{ + int cpu = raw_smp_processor_id(); + int g; + + for (g = 0; g < stress_nr_groups(); g++) { + unsigned long last_group; + last_group = stress_hpt_struct[cpu].last_group[g]; + + if (last_group != -1UL) { + int i; + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[g] = -1; + } + } +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group) +{ + unsigned long last_group; + int cpu = raw_smp_processor_id(); + + last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; + if (hpte_group == last_group) + return; + + if (last_group != -1UL) { + int i; + /* + * Concurrent CPUs might be inserting into this group, so + * give up after a number of iterations, to prevent a live + * lock. + */ + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1; + } + + if (ea >= PAGE_OFFSET) { + /* + * We would really like to prefetch to get the TLB loaded, then + * remove the PTE before returning from fault interrupt, to + * increase the hash fault rate. + * + * Unfortunately QEMU TCG does not model the TLB in a way that + * makes this possible, and systemsim (mambo) emulator does not + * bring in TLBs with prefetches (although loads/stores do + * work for non-CI PTEs). + * + * So remember this PTE and clear it on the next hash fault. + */ + memmove(&stress_hpt_struct[cpu].last_group[1], + &stress_hpt_struct[cpu].last_group[0], + (stress_nr_groups() - 1) * sizeof(unsigned long)); + stress_hpt_struct[cpu].last_group[0] = hpte_group; + } +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); + static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); long ret; hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); @@ -1893,15 +2134,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) if (!vsid) return; + if (linear_map_hash_slots[lmi] & 0x80) + return; + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -1911,11 +2155,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + raw_spin_lock(&linear_map_hash_lock); + if (!(linear_map_hash_slots[lmi] & 0x80)) { + raw_spin_unlock(&linear_map_hash_lock); + return; + } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -1925,7 +2172,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) mmu_kernel_ssize, 0); } -void __kernel_map_pages(struct page *page, int numpages, int enable) +void hash__kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long flags, vaddr, lmi; int i; @@ -1943,7 +2190,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } local_irq_restore(flags); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) @@ -2009,11 +2256,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { - pr_err("lpar: unable to create hpt_order debugsfs file\n"); - } - + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, + &fops_hpt_order); return 0; } machine_device_initcall(pseries, hash64_debugfs); @@ -2026,3 +2270,20 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* + * If we are using 1TB segments and we are allowed to randomise + * the heap, we can put it above 1TB so it is backed by a 1TB + * segment. Otherwise the heap will be in the bottom 1TB + * which always uses 256MB segments and this may result in a + * performance penalty. + */ + if (is_32bit_task()) + return randomize_page(mm->brk, SZ_32M); + else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T) + return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G); + else + return randomize_page(mm->brk, SZ_1G); +} diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index eefa89c6117b..5a2e512e96db 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -10,18 +10,13 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/machdep.h> unsigned int hpage_shift; EXPORT_SYMBOL(hpage_shift); -extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rlags, - unsigned long vflags, int psize, int ssize); - +#ifdef CONFIG_PPC_64S_HASH_MMU int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, unsigned int shift, unsigned int mmu_psize) @@ -72,7 +67,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) return 0; - rflags = htab_convert_pte_flags(new_pte); + rflags = htab_convert_pte_flags(new_pte, flags); if (unlikely(mmu_psize == MMU_PAGE_16G)) offset = PTRS_PER_PUD; else @@ -128,6 +123,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } +#endif pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -147,14 +143,17 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { + unsigned long psize; if (radix_enabled()) return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } -void hugetlbpage_init_default(void) +void __init hugetlbpage_init_defaultsize(void) { /* Set default large page size. Currently, we pick 16M or 1M * depending on what is available diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h new file mode 100644 index 000000000000..a57a25f06a21 --- /dev/null +++ b/arch/powerpc/mm/book3s64/internal.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H +#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H + +#include <linux/jump_label.h> + +extern bool stress_slb_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_slb_key); + +static inline bool stress_slb(void) +{ + return static_branch_unlikely(&stress_slb_key); +} + +extern bool stress_hpt_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_hpt_key); + +static inline bool stress_hpt(void) +{ + return static_branch_unlikely(&stress_hpt_key); +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group); + +void slb_setup_new_exec(void); + +void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); + +#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 56cc84520577..c0e8d597e4cb 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -96,16 +96,16 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, goto unlock_exit; } - down_read(&mm->mmap_sem); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + mmap_read_lock(mm); + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { unsigned long n = min(entries - entry, chunk); - ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; @@ -114,31 +114,13 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, pinned += ret; break; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (pinned != entries) { if (!ret) ret = -EFAULT; goto free_exit; } - pageshift = PAGE_SHIFT; - for (i = 0; i < entries; ++i) { - struct page *page = mem->hpages[i]; - - /* - * Allow to use larger than 64k IOMMU pages. Only do that - * if we are backed by hugetlb. - */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) - pageshift = page_shift(compound_head(page)); - mem->pageshift = min(mem->pageshift, pageshift); - /* - * We don't need struct page reference any more, switch - * to physical address. - */ - mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; - } - good_exit: atomic64_set(&mem->mapped, 1); mem->used = 1; @@ -147,7 +129,8 @@ good_exit: mutex_lock(&mem_list_mutex); - list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next, + lockdep_is_held(&mem_list_mutex)) { /* Overlap? */ if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && (ua < (mem2->ua + @@ -158,6 +141,27 @@ good_exit: } } + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. Skip device memory as it is not + * backed with page structs. + */ + pageshift = PAGE_SHIFT; + for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); + mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. + */ + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + } + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); mutex_unlock(&mem_list_mutex); @@ -167,9 +171,8 @@ good_exit: return 0; free_exit: - /* free the reference taken */ - for (i = 0; i < pinned; i++) - put_page(mem->hpages[i]); + /* free the references taken */ + unpin_user_pages(mem->hpages, pinned); vfree(mem->hpas); kfree(mem); @@ -215,7 +218,8 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) SetPageDirty(page); - put_page(page); + unpin_user_page(page); + mem->hpas[i] = 0; } } @@ -260,7 +264,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; /* Are there still mappings? */ - if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + if (atomic64_cmpxchg(&mem->mapped, 1, 0) != 1) { ++mem->used; ret = -EBUSY; goto unlock_exit; @@ -286,6 +290,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, { struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + rcu_read_lock(); list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { if ((mem->ua <= ua) && (ua + size <= mem->ua + @@ -294,29 +299,12 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, break; } } + rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(mm_iommu_lookup); -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -324,7 +312,8 @@ struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, mutex_lock(&mem_list_mutex); - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next, + lockdep_is_held(&mem_list_mutex)) { if ((mem->ua == ua) && (mem->entries == entries)) { ret = mem; ++mem->used; @@ -362,62 +351,13 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, unsigned int pageshift, unsigned long *size) { struct mm_iommu_table_group_mem_t *mem; unsigned long end; + rcu_read_lock(); list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) continue; @@ -434,6 +374,7 @@ bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, return true; } } + rcu_read_unlock(); return false; } diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 0ba30b8b935b..1715b07c630c 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -17,10 +17,13 @@ #include <linux/export.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/cpu.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> +#include "internal.h" + static DEFINE_IDA(mmu_context_ida); static int alloc_context_id(int min_id, int max_id) @@ -28,7 +31,8 @@ static int alloc_context_id(int min_id, int max_id) return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); } -void hash__reserve_context_id(int id) +#ifdef CONFIG_PPC_64S_HASH_MMU +void __init hash__reserve_context_id(int id) { int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); @@ -47,9 +51,9 @@ int hash__alloc_context_id(void) return alloc_context_id(MIN_USER_CONTEXT, max); } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +#endif -void slb_setup_new_exec(void); - +#ifdef CONFIG_PPC_64S_HASH_MMU static int realloc_context_ids(mm_context_t *ctx) { int i, id; @@ -118,7 +122,7 @@ static int hash__init_new_context(struct mm_struct *mm) /* This is fork. Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); #ifdef CONFIG_PPC_SUBPAGE_PROT - /* inherit subpage prot detalis if we have one. */ + /* inherit subpage prot details if we have one. */ if (current->mm->context.hash_context->spt) { mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); @@ -149,6 +153,13 @@ void hash__setup_new_exec(void) slb_setup_new_exec(); } +#else +static inline int hash__init_new_context(struct mm_struct *mm) +{ + BUILD_BUG(); + return 0; +} +#endif static int radix__init_new_context(struct mm_struct *mm) { @@ -174,7 +185,9 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); +#ifdef CONFIG_PPC_64S_HASH_MMU mm->context.hash_context = NULL; +#endif return index; } @@ -212,28 +225,36 @@ EXPORT_SYMBOL_GPL(__destroy_context); static void destroy_contexts(mm_context_t *ctx) { - int index, context_id; + if (radix_enabled()) { + ida_free(&mmu_context_ida, ctx->id); + } else { +#ifdef CONFIG_PPC_64S_HASH_MMU + int index, context_id; - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +#else + BUILD_BUG(); // radix_enabled() should be constant true +#endif } - kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -307,3 +328,22 @@ void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) isync(); } #endif + +/** + * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined) + * + * This clears the CPU from mm_cpumask for all processes, and then flushes the + * local TLB to ensure TLB coherency in case the CPU is onlined again. + * + * KVM guest translations are not necessarily flushed here. If KVM started + * using mm_cpumask or the Linux APIs which do, this would have to be resolved. + */ +#ifdef CONFIG_HOTPLUG_CPU +void cleanup_cpu_mmu_context(void) +{ + int cpu = smp_processor_id(); + + clear_tasks_mm_cpumask(cpu); + tlbiel_all(); +} +#endif diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 75483b40fcb1..3438ab72c346 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,19 +6,32 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/memblock.h> +#include <linux/memremap.h> +#include <linux/pkeys.h> +#include <linux/debugfs.h> +#include <linux/proc_fs.h> #include <misc/cxl-base.h> -#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> #include <asm/powernv.h> #include <asm/firmware.h> #include <asm/ultravisor.h> +#include <asm/kexec.h> #include <mm/mmu_decl.h> #include <trace/events/thp.h> +#include "internal.h" + +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif + unsigned long __pmd_frag_nr; EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; @@ -52,11 +65,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -78,24 +119,46 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } -static void do_nothing(void *unused) +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) { +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_large(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); } + +static void do_serialize(void *arg) +{ + /* We've taken the IPI, so try to trim the mask while here */ + if (radix_enabled()) { + struct mm_struct *mm = arg; + exit_lazy_flush_tlb(mm, false); + } +} + /* - * Serialize against find_current_mm_pte which does lock-less + * Serialize against __find_linux_pte() which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. + * __find_linux_pte() to finish. */ void serialize_against_pte_lookup(struct mm_struct *mm) { smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); + smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1); } /* @@ -109,15 +172,44 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return __pmd(old_pmd); +} + +pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, int full) +{ + pmd_t pmd; + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - * - * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires - * a special case check in pmd_access_permitted. + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); + if (!full) + flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + return pmd; +} + +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) @@ -125,12 +217,32 @@ static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + +/* + * At some point we should be able to get rid of + * pmd_mkhuge() and mk_huge_pmd() when we update all the + * other archs to mark the pmd huge in pfn_pmd() + */ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) { unsigned long pmdv; pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); + + return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); +} + +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); } pmd_t mk_pmd(struct page *page, pgprot_t pgprot) @@ -146,37 +258,27 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdv &= _HPAGE_CHG_MASK; return pmd_set_protbits(__pmd(pmdv), newprot); } - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - if (radix_enabled()) - prefetch((void *)addr); -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); else if (mmu_hash_ops.hpte_clear_all) mmu_hash_ops.hpte_clear_all(); + + reset_sprs(); } #ifdef CONFIG_MEMORY_HOTPLUG -int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) +int __meminit create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot) { if (radix_enabled()) - return radix__create_section_mapping(start, end, nid); + return radix__create_section_mapping(start, end, nid, prot); - return hash__create_section_mapping(start, end, nid); + return hash__create_section_mapping(start, end, nid, prot); } int __meminit remove_section_mapping(unsigned long start, unsigned long end) @@ -193,17 +295,12 @@ void __init mmu_partition_table_init(void) unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; unsigned long ptcr; - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); /* Initialize the Partition Table with no entries */ partition_tb = memblock_alloc(patb_size, patb_size); if (!partition_tb) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, patb_size, patb_size); - /* - * update partition table control register, - * 64 K size. - */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); @@ -288,22 +385,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -313,12 +410,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return - * the allocated page with single fragement + * If we find ptdesc_page set, we return + * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -339,12 +436,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -358,7 +458,7 @@ static inline void pgtable_free(void *table, int index) pmd_fragment_free(table); break; case PUD_INDEX: - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); + __pud_free(table); break; #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) /* 16M hugepd directory at pud level */ @@ -378,7 +478,6 @@ static inline void pgtable_free(void *table, int index) } } -#ifdef CONFIG_SMP void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) { unsigned long pgf = (unsigned long)table; @@ -395,12 +494,6 @@ void __tlb_remove_table(void *_table) return pgtable_free(table, index); } -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - return pgtable_free(table, index); -} -#endif #ifdef CONFIG_PROC_FS atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; @@ -449,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, set_pte_at(vma->vm_mm, addr, ptep, pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd @@ -470,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } +#endif /* * Does the CPU support tlbie? @@ -510,9 +605,50 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. */ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; } arch_initcall(pgtable_debugfs_setup); + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN) +/* + * Override the generic version in mm/memremap.c. + * + * With hash translation, the direct-map range is mapped with just one + * page size selected by htab_init_page_sizes(). Consult + * mmu_psize_defs[] to determine the minimum page size alignment. +*/ +unsigned long memremap_compat_align(void) +{ + if (!radix_enabled()) { + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; + return max(SUBSECTION_SIZE, 1UL << shift); + } + + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) + vm_flags |= VM_READ; + + prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]); + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 59e0ebbd8036..a974baf8f327 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -9,60 +9,108 @@ #include <asm/mmu_context.h> #include <asm/mmu.h> #include <asm/setup.h> +#include <asm/smp.h> +#include <asm/firmware.h> + #include <linux/pkeys.h> -#include <linux/of_device.h> +#include <linux/of_fdt.h> -DEFINE_STATIC_KEY_TRUE(pkey_disabled); -int pkeys_total; /* Total pkeys as per device tree */ -u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ -u32 reserved_allocation_mask; /* Bits set for reserved keys */ -static bool pkey_execute_disable_supported; -static bool pkeys_devtree_defined; /* property exported by device tree */ -static u64 pkey_amr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ + +int num_pkey; /* Max number of pkeys supported */ +/* + * Keys marked in the reservation list cannot be allocated by userspace + */ +u32 reserved_allocation_mask __ro_after_init; + +/* Bits set for the initially allocated keys */ +static u32 initial_allocation_mask __ro_after_init; + +/* + * Even if we allocate keys with sys_pkey_alloc(), we need to make sure + * other thread still find the access denied using the same keys. + */ +u64 default_amr __ro_after_init = ~0x0UL; +u64 default_iamr __ro_after_init = 0x5555555555555555UL; +u64 default_uamor __ro_after_init; +EXPORT_SYMBOL(default_amr); +/* + * Key used to implement PROT_EXEC mmap. Denies READ/WRITE + * We pick key 2 because 0 is special key and 1 is reserved as per ISA. + */ static int execute_only_key = 2; +static bool pkey_execute_disable_supported; + #define AMR_BITS_PER_PKEY 2 #define AMR_RD_BIT 0x1UL #define AMR_WR_BIT 0x2UL #define IAMR_EX_BIT 0x1UL -#define PKEY_REG_BITS (sizeof(u64)*8) +#define PKEY_REG_BITS (sizeof(u64) * 8) #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) -static void scan_pkey_feature(void) +static int __init dt_scan_storage_keys(unsigned long node, + const char *uname, int depth, + void *data) { - u32 vals[2]; - struct device_node *cpu; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int *pkeys_total = (int *) data; - cpu = of_find_node_by_type(NULL, "cpu"); - if (!cpu) - return; + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; - if (of_property_read_u32_array(cpu, - "ibm,processor-storage-keys", vals, 2)) - return; + prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL); + if (!prop) + return 0; + *pkeys_total = be32_to_cpu(prop[0]); + return 1; +} + +static int __init scan_pkey_feature(void) +{ + int ret; + int pkeys_total = 0; /* - * Since any pkey can be used for data or execute, we will just treat - * all keys as equal and track them as one entity. + * Pkey is not supported with Radix translation. */ - pkeys_total = vals[0]; - pkeys_devtree_defined = true; -} + if (early_radix_enabled()) + return 0; -static inline bool pkey_mmu_enabled(void) -{ - if (firmware_has_feature(FW_FEATURE_LPAR)) - return pkeys_total; - else - return cpu_has_feature(CPU_FTR_PKEY); + ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total); + if (ret == 0) { + /* + * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device + * tree. We make this exception since some version of skiboot forgot to + * expose this property on power8/9. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 || + PVR_VER(pvr) == PVR_HX_C2000) + pkeys_total = 32; + } + } + +#ifdef CONFIG_PPC_MEM_KEYS + /* + * Adjust the upper limit, based on the number of bits supported by + * arch-neutral code. + */ + pkeys_total = min_t(int, pkeys_total, + ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1)); +#endif + return pkeys_total; } -static int pkey_initialize(void) +void __init pkey_early_init_devtree(void) { - int os_reserved, i; + int pkeys_total, i; +#ifdef CONFIG_PPC_MEM_KEYS /* * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE. @@ -78,33 +126,22 @@ static int pkey_initialize(void) BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) != (sizeof(u64) * BITS_PER_BYTE)); - - /* scan the device tree for pkey feature */ - scan_pkey_feature(); - +#endif /* - * Let's assume 32 pkeys on P8 bare metal, if its not defined by device - * tree. We make this exception since skiboot forgot to expose this - * property on power8. + * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1 */ - if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) && - cpu_has_feature(CPU_FTRS_POWER8)) - pkeys_total = 32; + if (!early_cpu_has_feature(CPU_FTR_ARCH_206)) + return; - /* - * Adjust the upper limit, based on the number of bits supported by - * arch-neutral code. - */ - pkeys_total = min_t(int, pkeys_total, - ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); + /* scan the device tree for pkey feature */ + pkeys_total = scan_pkey_feature(); + if (!pkeys_total) + goto out; - if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) - static_branch_enable(&pkey_disabled); - else - static_branch_disable(&pkey_disabled); + /* Allow all keys to be modified by default */ + default_uamor = ~0x0UL; - if (static_branch_likely(&pkey_disabled)) - return 0; + cur_cpu_spec->mmu_features |= MMU_FTR_PKEY; /* * The device tree cannot be relied to indicate support for @@ -118,122 +155,180 @@ static int pkey_initialize(void) #ifdef CONFIG_PPC_4K_PAGES /* * The OS can manage only 8 pkeys due to its inability to represent them - * in the Linux 4K PTE. + * in the Linux 4K PTE. Mark all other keys reserved. */ - os_reserved = pkeys_total - 8; + num_pkey = min(8, pkeys_total); #else - os_reserved = 0; + num_pkey = pkeys_total; #endif - /* Bits are in LE format. */ - reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); - - /* register mask is in BE format */ - pkey_amr_mask = ~0x0ul; - pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask = ~0x0ul; - pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + */ + execute_only_key = -1; + } else { + /* + * Mark the execute_only_pkey as not available for + * user allocation via pkey_alloc. + */ + reserved_allocation_mask |= (0x1 << execute_only_key); - pkey_uamor_mask = ~0x0ul; - pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + /* + * Deny READ/WRITE for execute_only_key. + * Allow execute in IAMR. + */ + default_amr |= (0x3ul << pkeyshift(execute_only_key)); + default_iamr &= ~(0x1ul << pkeyshift(execute_only_key)); - /* mark the rest of the keys as reserved and hence unavailable */ - for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { - reserved_allocation_mask |= (0x1 << i); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); + /* + * Clear the uamor bits for this key. + */ + default_uamor &= ~(0x3ul << pkeyshift(execute_only_key)); } - initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); - if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + if (unlikely(num_pkey <= 3)) { /* * Insufficient number of keys to support - * execute only key. Mark it unavailable. - * Any AMR, UAMOR, IAMR bit set for - * this key is irrelevant since this key - * can never be allocated. + * KUAP/KUEP feature. */ - execute_only_key = -1; + disable_kuep = true; + disable_kuap = true; + WARN(1, "Disabling kernel user protection due to low (%d) max supported keys\n", num_pkey); + } else { + /* handle key which is used by kernel for KAUP */ + reserved_allocation_mask |= (0x1 << 3); + /* + * Mark access for kup_key in default amr so that + * we continue to operate with that AMR in + * copy_to/from_user(). + */ + default_amr &= ~(0x3ul << pkeyshift(3)); + default_iamr &= ~(0x1ul << pkeyshift(3)); + default_uamor &= ~(0x3ul << pkeyshift(3)); } - return 0; -} - -arch_initcall(pkey_initialize); - -void pkey_mm_init(struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - mm_pkey_allocation_map(mm) = initial_allocation_mask; - mm->context.execute_only_pkey = execute_only_key; -} + /* + * Allow access for only key 0. And prevent any other modification. + */ + default_amr &= ~(0x3ul << pkeyshift(0)); + default_iamr &= ~(0x1ul << pkeyshift(0)); + default_uamor &= ~(0x3ul << pkeyshift(0)); + /* + * key 0 is special in that we want to consider it an allocated + * key which is preallocated. We don't allow changing AMR bits + * w.r.t key 0. But one can pkey_free(key0) + */ + initial_allocation_mask |= (0x1 << 0); -static inline u64 read_amr(void) -{ - return mfspr(SPRN_AMR); -} + /* + * key 1 is recommended not to be used. PowerISA(3.0) page 1015, + * programming note. + */ + reserved_allocation_mask |= (0x1 << 1); + default_uamor &= ~(0x3ul << pkeyshift(1)); -static inline void write_amr(u64 value) -{ - mtspr(SPRN_AMR, value); -} + /* + * Prevent the usage of OS reserved keys. Update UAMOR + * for those keys. Also mark the rest of the bits in the + * 32 bit mask as reserved. + */ + for (i = num_pkey; i < 32 ; i++) { + reserved_allocation_mask |= (0x1 << i); + default_uamor &= ~(0x3ul << pkeyshift(i)); + } + /* + * Prevent the allocation of reserved keys too. + */ + initial_allocation_mask |= reserved_allocation_mask; -static inline u64 read_iamr(void) -{ - if (!likely(pkey_execute_disable_supported)) - return 0x0UL; + pr_info("Enabling pkeys with max key count %d\n", num_pkey); +out: + /* + * Setup uamor on boot cpu + */ + mtspr(SPRN_UAMOR, default_uamor); - return mfspr(SPRN_IAMR); + return; } -static inline void write_iamr(u64 value) +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) { - if (!likely(pkey_execute_disable_supported)) + if (disabled) + return; + /* + * On hash if PKEY feature is not enabled, disable KUAP too. + */ + if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY)) return; - mtspr(SPRN_IAMR, value); -} + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Execution Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUEP; + } -static inline u64 read_uamor(void) -{ - return mfspr(SPRN_UAMOR); + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED); + isync(); } +#endif -static inline void write_uamor(u64 value) +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) { - mtspr(SPRN_UAMOR, value); -} + if (disabled) + return; + /* + * On hash if PKEY feature is not enabled, disable KUAP too. + */ + if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY)) + return; -static bool is_pkey_enabled(int pkey) -{ - u64 uamor = read_uamor(); - u64 pkey_bits = 0x3ul << pkeyshift(pkey); - u64 uamor_pkey_bits = (uamor & pkey_bits); + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_KUAP; + } /* - * Both the bits in UAMOR corresponding to the key should be set or - * reset. + * Set the default kernel AMR values on all cpus. */ - WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); - return !!(uamor_pkey_bits); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + +#ifdef CONFIG_PPC_MEM_KEYS +void pkey_mm_init(struct mm_struct *mm) +{ + if (!mmu_has_feature(MMU_FTR_PKEY)) + return; + mm_pkey_allocation_map(mm) = initial_allocation_mask; + mm->context.execute_only_pkey = execute_only_key; } static inline void init_amr(int pkey, u8 init_bits) { u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); - u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); + u64 old_amr = current_thread_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); - write_amr(old_amr | new_amr_bits); + current->thread.regs->amr = old_amr | new_amr_bits; } static inline void init_iamr(int pkey, u8 init_bits) { u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); - u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); + u64 old_iamr = current_thread_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); - write_iamr(old_iamr | new_iamr_bits); + if (!likely(pkey_execute_disable_supported)) + return; + + current->thread.regs->iamr = old_iamr | new_iamr_bits; } /* @@ -245,8 +340,18 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, { u64 new_amr_bits = 0x0ul; u64 new_iamr_bits = 0x0ul; + u64 pkey_bits, uamor_pkey_bits; + + /* + * Check whether the key is disabled by UAMOR. + */ + pkey_bits = 0x3ul << pkeyshift(pkey); + uamor_pkey_bits = (default_uamor & pkey_bits); - if (!is_pkey_enabled(pkey)) + /* + * Both the bits in UAMOR corresponding to the key should be set + */ + if (uamor_pkey_bits != pkey_bits) return -EINVAL; if (init_val & PKEY_DISABLE_EXECUTE) { @@ -266,48 +371,7 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, return 0; } -void thread_pkey_regs_save(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* - * TODO: Skip saving registers if @thread hasn't used any keys yet. - */ - thread->amr = read_amr(); - thread->iamr = read_iamr(); - thread->uamor = read_uamor(); -} - -void thread_pkey_regs_restore(struct thread_struct *new_thread, - struct thread_struct *old_thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - if (old_thread->amr != new_thread->amr) - write_amr(new_thread->amr); - if (old_thread->iamr != new_thread->iamr) - write_iamr(new_thread->iamr); - if (old_thread->uamor != new_thread->uamor) - write_uamor(new_thread->uamor); -} - -void thread_pkey_regs_init(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - thread->amr = pkey_amr_mask; - thread->iamr = pkey_iamr_mask; - thread->uamor = pkey_uamor_mask; - - write_uamor(pkey_uamor_mask); - write_amr(pkey_amr_mask); - write_iamr(pkey_iamr_mask); -} - -int __execute_only_pkey(struct mm_struct *mm) +int execute_only_pkey(struct mm_struct *mm) { return mm->context.execute_only_pkey; } @@ -315,7 +379,7 @@ int __execute_only_pkey(struct mm_struct *mm) static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) { /* Do this check first since the vm_flags should be hot */ - if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC) return false; return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); @@ -353,21 +417,20 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute) int pkey_shift; u64 amr; - if (!is_pkey_enabled(pkey)) - return true; - pkey_shift = pkeyshift(pkey); - if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) - return true; + if (execute) + return !(current_thread_iamr() & (IAMR_EX_BIT << pkey_shift)); - amr = read_amr(); /* Delay reading amr until absolutely needed */ - return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || - (write && !(amr & (AMR_WR_BIT << pkey_shift)))); + amr = current_thread_amr(); + if (write) + return !(amr & (AMR_WR_BIT << pkey_shift)); + + return !(amr & (AMR_RD_BIT << pkey_shift)); } bool arch_pte_access_permitted(u64 pte, bool write, bool execute) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return true; return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); @@ -381,22 +444,10 @@ bool arch_pte_access_permitted(u64 pte, bool write, bool execute) * So do not enforce things if the VMA is not from the current mm, or if we are * in a kernel thread. */ -static inline bool vma_is_foreign(struct vm_area_struct *vma) -{ - if (!current->mm) - return true; - - /* if it is not our ->mm, it has to be foreign */ - if (current->mm != vma->vm_mm) - return true; - - return false; -} - bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return true; /* * Do not enforce our key-permissions on a foreign vma. @@ -409,10 +460,12 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; /* Duplicate the oldmm pkey state in mm: */ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; } + +#endif /* CONFIG_PPC_MEM_KEYS */ diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index cab06331c0c0..35fd2a95be24 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -2,8 +2,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/security.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/machdep.h> #include <asm/mman.h> @@ -34,62 +32,14 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st struct hstate *hstate = hstate_file(vma->vm_file); psize = hstate_get_psize(hstate); - radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); -} - -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } /* - * We are always doing an topdown search here. Slice code - * does that too. + * Flush PWC even if we get PUD_SIZE hugetlb invalidate to keep this simpler. */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); + if (end - start >= PUD_SIZE) + radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); + else + radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, @@ -97,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, pte_t old_pte, pte_t pte) { struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 974109bb85db..c6a4ac766b2b 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -11,13 +11,13 @@ #include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> +#include <linux/of.h> #include <linux/of_fdt.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/string_helpers.h> -#include <linux/stop_machine.h> +#include <linux/memory.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/mmu_context.h> #include <asm/dma.h> @@ -26,13 +26,16 @@ #include <asm/firmware.h> #include <asm/powernv.h> #include <asm/sections.h> +#include <asm/smp.h> #include <asm/trace.h> #include <asm/uaccess.h> #include <asm/ultravisor.h> +#include <asm/set_memory.h> #include <trace/events/thp.h> -unsigned int mmu_pid_bits; +#include <mm/mmu_decl.h> + unsigned int mmu_base_pid; static __ref void *early_alloc_pgtable(unsigned long size, int nid, @@ -56,6 +59,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid, return ptr; } +/* + * When allocating pud or pmd pointers, we allocate a complete page + * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This + * is to ensure that the page obtained from the memblock allocator + * can be completely used as page table page and can be freed + * correctly when the page table entries are removed. + */ static int early_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int map_page_size, @@ -64,24 +74,26 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, - region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pudp = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + p4d_populate(&init_mm, p4dp, pudp); } - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (map_page_size == PUD_SIZE) { ptep = (pte_t *)pudp; goto set_the_pte; } if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, - region_start, region_end); + pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start, + region_end); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); @@ -98,7 +110,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, set_the_pte: set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); + asm volatile("ptesync": : :"memory"); return 0; } @@ -114,6 +126,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -136,7 +149,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, * boot. */ pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) { @@ -156,7 +170,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, set_the_pte: set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); + asm volatile("ptesync": : :"memory"); return 0; } @@ -168,11 +182,12 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, } #ifdef CONFIG_STRICT_KERNEL_RWX -void radix__change_memory_range(unsigned long start, unsigned long end, - unsigned long clear) +static void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) { unsigned long idx; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -185,7 +200,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end, for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; if (pud_is_leaf(*pudp)) { @@ -214,9 +230,17 @@ void radix__mark_rodata_ro(void) unsigned long start, end; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -245,27 +269,50 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); #endif return end; } static int __meminit create_physical_mapping(unsigned long start, unsigned long end, - int nid) + int nid, pgprot_t _prot) { unsigned long vaddr, addr, mapping_size = 0; bool prev_exec, exec = false; pgprot_t prot; int psize; + unsigned long max_mapping_size = memory_block_size; + + if (debug_pagealloc_enabled_or_kfence()) + max_mapping_size = PAGE_SIZE; - start = _ALIGN_UP(start, PAGE_SIZE); + start = ALIGN(start, PAGE_SIZE); + end = ALIGN_DOWN(end, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { unsigned long gap, previous_size; int rc; gap = next_boundary(addr, end) - addr; + if (gap > max_mapping_size) + gap = max_mapping_size; previous_size = mapping_size; prev_exec = exec; @@ -289,7 +336,7 @@ static int __meminit create_physical_mapping(unsigned long start, prot = PAGE_KERNEL_X; exec = true; } else { - prot = PAGE_KERNEL; + prot = _prot; exec = false; } @@ -312,51 +359,40 @@ static int __meminit create_physical_mapping(unsigned long start, static void __init radix_init_pgtable(void) { unsigned long rts_field; - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; /* We don't support slb for radix */ - mmu_slb_size = 0; + slb_set_size(0); + /* - * Create the linear mapping, using standard page size for now + * Create the linear mapping */ - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { /* * The memblock allocator is up at this point, so the * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ - if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + if (end >= RADIX_VMALLOC_START) { pr_warn("Outside the supported range\n"); continue; } - WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size, - -1)); + WARN_ON(create_physical_mapping(start, end, + -1, PAGE_KERNEL)); } - /* Find out how many PID bits are supported */ - if (cpu_has_feature(CPU_FTR_HVMODE)) { - if (!mmu_pid_bits) - mmu_pid_bits = 20; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (!cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { /* - * When KVM is possible, we only use the top half of the - * PID space to avoid collisions between host and guest PIDs - * which can cause problems due to prefetch when exiting the - * guest with AIL=3 + * Older versions of KVM on these machines prefer if the + * guest only uses the low 19 PID bits. */ - mmu_base_pid = 1 << (mmu_pid_bits - 1); -#else - mmu_base_pid = 1; -#endif - } else { - /* The guest uses the bottom half of the PID space */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - mmu_base_pid = 1; + mmu_pid_bits = 19; } + mmu_base_pid = 1; /* * Allocate Partition table and process table for the @@ -435,11 +471,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - /* Grab page size encodings */ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); if (!prop) @@ -462,6 +493,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, def = &mmu_psize_defs[idx]; def->shift = shift; def->ap = ap; + def->h_rpt_pgsize = psize_to_rpti_pgsize(idx); } /* needed ? */ @@ -477,88 +509,35 @@ void __init radix__early_init_devtree(void) * Try to find the available page sizes in the device-tree */ rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - /* - * let's assume we have page 4k and 64k support - */ - mmu_psize_defs[MMU_PAGE_4K].shift = 12; - mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; - - mmu_psize_defs[MMU_PAGE_64K].shift = 16; - mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; -found: - return; -} - -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - /* - * Radix always uses key0 of the IAMR to determine if an access is - * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction - * fetch. - */ - mtspr(SPRN_IAMR, (1ul << 62)); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void setup_kuap(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) { - pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + if (!rc) { + /* + * No page size details found in device tree. + * Let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize = + psize_to_rpti_pgsize(MMU_PAGE_4K); + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; + mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize = + psize_to_rpti_pgsize(MMU_PAGE_64K); } - - /* Make sure userspace can't change the AMR */ - mtspr(SPRN_UAMOR, 0); - mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); - isync(); + return; } -#endif void __init radix__early_init_mmu(void) { unsigned long lpcr; +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; #endif /* * initialize page table size @@ -599,7 +578,6 @@ void __init radix__early_init_mmu(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); - radix_init_amor(); } else { radix_init_pseries(); } @@ -623,15 +601,17 @@ void radix__early_init_mmu_secondary(void) set_ptcr_when_no_uv(__pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - - radix_init_amor(); } radix__switch_mmu_context(NULL, &init_mm); tlbiel_all(); + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; @@ -644,21 +624,6 @@ void radix__mmu_cleanup_all(void) } } -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* - * We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. - */ - ppc64_rma_size = ULONG_MAX; -} - #ifdef CONFIG_MEMORY_HOTPLUG static void free_pte_table(pte_t *pte_start, pmd_t *pmd) { @@ -690,108 +655,108 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) pud_clear(pud); } -struct change_mapping_params { - pte_t *pte; - unsigned long start; - unsigned long end; - unsigned long aligned_start; - unsigned long aligned_end; -}; - -static int __meminit stop_machine_change_mapping(void *data) +static void free_pud_table(pud_t *pud_start, p4d_t *p4d) { - struct change_mapping_params *params = - (struct change_mapping_params *)data; + pud_t *pud; + int i; - if (!data) - return -1; + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } - spin_unlock(&init_mm.page_table_lock); - pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); - create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); - spin_lock(&init_mm.page_table_lock); - return 0; + pud_free(&init_mm, pud_start); + p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) { - unsigned long next; - pte_t *pte; + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); - pte = pte_start + pte_index(addr); - for (; addr < end; addr = next, pte++) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - if (next > end) - next = end; + return !vmemmap_populated(start, PMD_SIZE); +} - if (!pte_present(*pte)) - continue; +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; - } + return !vmemmap_populated(start, PAGE_SIZE); - pte_clear(&init_mm, addr, pte); - } } +#endif -/* - * clear the pte and potentially split the mapping helper - */ -static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, - unsigned long size, pte_t *pte) +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) { - unsigned long mask = ~(size - 1); - unsigned long aligned_start = addr & mask; - unsigned long aligned_end = addr + size; - struct change_mapping_params params; - bool split_region = false; + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); - if ((end - addr) < size) { /* - * We're going to clear the PTE, but not flushed - * the mapping, time to remap and flush. The - * effects if visible outside the processor or - * if we are running in code close to the - * mapping we cleared, we are in trouble. + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. */ - if (overlaps_kernel_text(aligned_start, addr) || - overlaps_kernel_text(end, aligned_end)) { - /* - * Hack, just return, don't pte_clear - */ - WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " - "text, not splitting\n", addr, end); + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); return; } - split_region = true; } - if (split_region) { - params.pte = pte; - params.start = addr; - params.end = end; - params.aligned_start = addr & ~(size - 1); - params.aligned_end = min_t(unsigned long, aligned_end, - (unsigned long)__va(memblock_end_of_DRAM())); - stop_machine(stop_machine_change_mapping, ¶ms, NULL); - return; - } + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; - pte_clear(&init_mm, addr, pte); + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif + } + if (direct) + update_page_count(mmu_virtual_psize, -pages); } -static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end) +static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pte_t *pte_base; pmd_t *pmd; @@ -803,20 +768,35 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, continue; if (pmd_is_leaf(*pmd)) { - split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } + if (direct) + update_page_count(MMU_PAGE_2M, -pages); } -static void remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end) +static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pmd_t *pmd_base; pud_t *pud; @@ -828,21 +808,32 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr, continue; if (pud_is_leaf(*pud)) { - split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); + if (!IS_ALIGNED(addr, PUD_SIZE) || + !IS_ALIGNED(next, PUD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + pte_clear(&init_mm, addr, (pte_t *)pud); + pages++; continue; } - pmd_base = (pmd_t *)pud_page_vaddr(*pud); - remove_pmd_table(pmd_base, addr, next); + pmd_base = pud_pgtable(*pud); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } + if (direct) + update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; pgd_t *pgd; + p4d_t *p4d; spin_lock(&init_mm.page_table_lock); @@ -850,35 +841,46 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) continue; - if (pgd_is_leaf(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + if (p4d_is_leaf(*p4d)) { + if (!IS_ALIGNED(addr, P4D_SIZE) || + !IS_ALIGNED(next, P4D_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pgd); continue; } - pud_base = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud_base, addr, next); + pud_base = p4d_pgtable(*p4d); + remove_pud_table(pud_base, addr, next, direct, altmap); + free_pud_table(pud_base, p4d); } spin_unlock(&init_mm.page_table_lock); radix__flush_tlb_kernel_range(start, end); } -int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) +int __meminit radix__create_section_mapping(unsigned long start, + unsigned long end, int nid, + pgprot_t prot) { if (end >= RADIX_VMALLOC_START) { pr_warn("Outside the supported range\n"); return -1; } - return create_physical_mapping(__pa(start), __pa(end), nid); + return create_physical_mapping(__pa(start), __pa(end), + nid, prot); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end); + remove_pagetable(start, end, true, NULL); return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -896,7 +898,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys) { /* Create a PTE encoding */ - unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; @@ -905,20 +906,453 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return -1; } - ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid); BUG_ON(ret); return 0; } + +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); + continue; + } else if (altmap) { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size); + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); } #endif #endif +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +void radix__kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long addr; + + addr = (unsigned long)page_address(page); + + if (enable) + set_memory_p(addr, numpages); + else + set_memory_np(addr, numpages); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, @@ -932,8 +1366,25 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif - old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); + trace_hugepage_update_pmd(addr, old, clr, set); + + return old; +} + +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); return old; } @@ -953,9 +1404,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /*FIXME!! Verify whether we need this kick below */ - serialize_against_pte_lookup(vma->vm_mm); - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; @@ -1014,41 +1462,46 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); - /* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, unsigned long address, int psize) { struct mm_struct *mm = vma->vm_mm; - unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | - _PAGE_RW | _PAGE_EXEC); + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY | + _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); unsigned long change = pte_val(entry) ^ pte_val(*ptep); /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. + * On POWER9, the NMMU is not able to relax PTE access permissions + * for a translation with a TLB. The PTE must be invalidated, TLB + * flushed before the new PTE is installed. + * + * This only needs to be done for radix, because hash translation does + * flush when updating the linux pte (and we don't support NMMU + * accelerators on HPT on POWER9 anyway XXX: do we?). + * + * POWER10 (and P9P) NMMU does behave as per ISA. */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && + atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ new_pte = old_pte | set; radix__flush_tlb_page_psize(mm, address, psize); __radix_pte_update(ptep, _PAGE_INVALID, new_pte); @@ -1056,9 +1509,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, __radix_pte_update(ptep, 0, set); /* * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architectue. + * restrictions when the address space (modulo the POWER9 nest + * MMU issue above) because the MMU will reload the PTE after + * taking an access fault, as defined by the architecture. See + * "Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates)" in + * Power ISA Version 3.1B. */ } /* See ptesync comment in radix__set_pte_at */ @@ -1071,33 +1527,18 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && (atomic_read(&mm->context.copros) > 0)) radix__flush_tlb_page(vma, addr); set_pte_at(mm, addr, ptep, pte); } -int __init arch_ioremap_pud_supported(void) -{ - /* HPT does not cope with large pages in the vmalloc area */ - return radix_enabled(); -} - -int __init arch_ioremap_pmd_supported(void) -{ - return radix_enabled(); -} - -int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) -{ - return 0; -} - int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { pte_t *ptep = (pte_t *)pud; @@ -1113,7 +1554,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (pud_huge(*pud)) { + if (pud_is_leaf(*pud)) { pud_clear(pud); return 1; } @@ -1126,7 +1567,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) pmd_t *pmd; int i; - pmd = (pmd_t *)pud_page_vaddr(*pud); + pmd = pud_pgtable(*pud); pud_clear(pud); flush_tlb_kernel_range(addr, addr + PUD_SIZE); @@ -1160,7 +1601,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { pmd_clear(pmd); return 1; } @@ -1181,8 +1622,3 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } - -int __init arch_ioremap_p4d_supported(void) -{ - return 0; -} diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index a95175c0972b..9e1f6558d026 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,16 +10,16 @@ #include <linux/memblock.h> #include <linux/mmu_context.h> #include <linux/sched/mm.h> +#include <linux/debugfs.h> #include <asm/ppc-opcode.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/trace.h> #include <asm/cputhreads.h> +#include <asm/plpar_wrappers.h> -#define RIC_FLUSH_TLB 0 -#define RIC_FLUSH_PWC 1 -#define RIC_FLUSH_ALL 2 +#include "internal.h" /* * tlbiel instruction for radix, set invalidation @@ -55,16 +55,23 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) if (early_cpu_has_feature(CPU_FTR_HVMODE)) { /* MSR[HV] should flush partition scope translations first. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, + RIC_FLUSH_TLB, 0); + } } /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - asm volatile("ptesync": : :"memory"); + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + } + + ppc_after_tlbiel_barrier(); } void radix__tlbiel_all(unsigned int action) @@ -244,7 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid) } } - static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap) { @@ -281,29 +287,39 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set in radix mode and 256 set in hpt mode. */ -static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. - */ - __tlbiel_pid(pid, 0, ric); + switch (ric) { + case RIC_FLUSH_PWC: - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); + /* For PWC, only one flush is needed */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + ppc_after_tlbiel_barrier(); return; + case RIC_FLUSH_TLB: + __tlbiel_pid(pid, 0, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_ALL: + default: + /* + * Flush the first set of the TLB, and if + * we're doing a RIC_FLUSH_ALL, also flush + * the entire Page Walk Cache. + */ + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL); } - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, RIC_FLUSH_TLB); + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); + } - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory"); } @@ -313,7 +329,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) /* * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint + * must be a compile-time constraint to match the "i" constraint * in the asm statement. */ switch (ric) { @@ -430,7 +446,7 @@ static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void _tlbiel_va_range(unsigned long start, unsigned long end, @@ -441,7 +457,7 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void __tlbie_va_range(unsigned long start, unsigned long end, @@ -565,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); @@ -578,26 +595,33 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP void radix__local_flush_all_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_all_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + radix__local_flush_all_mm(mm); +} #endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -612,47 +636,86 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd } EXPORT_SYMBOL(radix__local_flush_tlb_page); -static bool mm_is_singlethreaded(struct mm_struct *mm) -{ - if (atomic_read(&mm->context.copros) > 0) - return false; - if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) - return true; - return false; -} - static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; if (atomic_read(&mm->context.copros) > 0) return true; return false; } -#ifdef CONFIG_SMP -static void do_exit_flush_lazy_tlb(void *arg) +/* + * If always_flush is true, then flush even if this CPU can't be removed + * from mm_cpumask. + */ +void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) { - struct mm_struct *mm = arg; unsigned long pid = mm->context.id; + int cpu = smp_processor_id(); + /* + * A kthread could have done a mmget_not_zero() after the flushing CPU + * checked mm_cpumask, and be in the process of kthread_use_mm when + * interrupted here. In that case, current->mm will be set to mm, + * because kthread_use_mm() setting ->mm and switching to the mm is + * done with interrupts off. + */ if (current->mm == mm) - return; /* Local CPU */ + goto out; if (current->active_mm == mm) { + unsigned long flags; + + WARN_ON_ONCE(current->mm != NULL); /* - * Must be a kernel thread because sender is single-threaded. + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. */ - BUG_ON(current->mm); - mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); + local_irq_save(flags); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; - mmdrop(mm); + switch_mm_irqs_off(mm, &init_mm, current); + mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } - _tlbiel_pid(pid, RIC_FLUSH_ALL); + + /* + * This IPI may be initiated from any source including those not + * running the mm, so there may be a racing IPI that comes after + * this one which finds the cpumask already clear. Check and avoid + * underflowing the active_cpus count in that case. The race should + * not otherwise be a problem, but the TLB must be flushed because + * that's what the caller expects. + */ + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { + dec_mm_active_cpus(mm); + cpumask_clear_cpu(cpu, mm_cpumask(mm)); + always_flush = true; + } + +out: + if (always_flush) + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +#ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + exit_lazy_flush_tlb(mm, true); } static void exit_flush_lazy_tlbs(struct mm_struct *mm) @@ -666,30 +729,136 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm) */ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, (void *)mm, 1); - mm_reset_thread_local(mm); } +#else /* CONFIG_SMP */ +static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { } +#endif /* CONFIG_SMP */ + +static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock); + +/* + * Interval between flushes at which we send out IPIs to check whether the + * mm_cpumask can be trimmed for the case where it's not a single-threaded + * process flushing its own mm. The intent is to reduce the cost of later + * flushes. Don't want this to be so low that it adds noticable cost to TLB + * flushing, or so high that it doesn't help reduce global TLBIEs. + */ +static unsigned long tlb_mm_cpumask_trim_timer = 1073; + +static bool tick_and_test_trim_clock(void) +{ + if (__this_cpu_inc_return(mm_cpumask_trim_clock) == + tlb_mm_cpumask_trim_timer) { + __this_cpu_write(mm_cpumask_trim_clock, 0); + return true; + } + return false; +} + +enum tlb_flush_type { + FLUSH_TYPE_NONE, + FLUSH_TYPE_LOCAL, + FLUSH_TYPE_GLOBAL, +}; + +static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm) +{ + int active_cpus = atomic_read(&mm->context.active_cpus); + int cpu = smp_processor_id(); + + if (active_cpus == 0) + return FLUSH_TYPE_NONE; + if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) { + if (current->mm != mm) { + /* + * Asynchronous flush sources may trim down to nothing + * if the process is not running, so occasionally try + * to trim. + */ + if (tick_and_test_trim_clock()) { + exit_lazy_flush_tlb(mm, true); + return FLUSH_TYPE_NONE; + } + } + return FLUSH_TYPE_LOCAL; + } + + /* Coprocessors require TLBIE to invalidate nMMU. */ + if (atomic_read(&mm->context.copros) > 0) + return FLUSH_TYPE_GLOBAL; + + /* + * In the fullmm case there's no point doing the exit_flush_lazy_tlbs + * because the mm is being taken down anyway, and a TLBIE tends to + * be faster than an IPI+TLBIEL. + */ + if (fullmm) + return FLUSH_TYPE_GLOBAL; + + /* + * If we are running the only thread of a single-threaded process, + * then we should almost always be able to trim off the rest of the + * CPU mask (except in the case of use_mm() races), so always try + * trimming the mask. + */ + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) { + exit_flush_lazy_tlbs(mm); + /* + * use_mm() race could prevent IPIs from being able to clear + * the cpumask here, however those users are established + * after our first check (and so after the PTEs are removed), + * and the TLB still gets flushed by the IPI, so this CPU + * will only require a local flush. + */ + return FLUSH_TYPE_LOCAL; + } + + /* + * Occasionally try to trim down the cpumask. It's possible this can + * bring the mask to zero, which results in no flush. + */ + if (tick_and_test_trim_clock()) { + exit_flush_lazy_tlbs(mm); + if (current->mm == mm) + return FLUSH_TYPE_LOCAL; + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + exit_lazy_flush_tlb(mm, true); + return FLUSH_TYPE_NONE; + } + + return FLUSH_TYPE_GLOBAL; +} + +#ifdef CONFIG_SMP void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); /* - * Order loads of mm_cpumask vs previous stores to clear ptes before - * the invalidate. See barrier in switch_mm_irqs_off + * Order loads of mm_cpumask (in flush_type_needed) vs previous + * stores to clear ptes before the invalidate. See barrier in + * switch_mm_irqs_off */ smp_mb(); - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - - if (cputlb_use_tlbie()) { + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) { if (mm_needs_flush_escalation(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else @@ -697,40 +866,43 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } else { _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); } - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_TLB); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); static void __flush_all_mm(struct mm_struct *mm, bool fullmm) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (!fullmm) { - exit_flush_lazy_tlbs(mm); - goto local; - } - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, fullmm); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -743,25 +915,34 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, pg_sizes, size; + + tgt = H_RPTI_TARGET_CMMU; + pg_sizes = psize_to_rpti_pgsize(psize); + size = 1UL << mmu_psize_to_shift(psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + pg_sizes, vmaddr, + vmaddr + size); + } else if (cputlb_use_tlbie()) _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); } preempt_enable(); } @@ -776,8 +957,6 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(radix__flush_tlb_page); -#else /* CONFIG_SMP */ -#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ static void do_tlbiel_kernel(void *info) @@ -805,13 +984,23 @@ static inline void _tlbiel_kernel_broadcast(void) */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL, + start, end); + } else if (cputlb_use_tlbie()) _tlbie_pid(0, RIC_FLUSH_ALL); else _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +/* + * Doesn't appear to be used anywhere. Remove. + */ #define TLB_FLUSH_ALL -1UL /* @@ -823,77 +1012,90 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) - { unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; + bool flush_pid, flush_pwc = false; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_NONE) + goto out; - if (full) { - if (local) { - _tlbiel_pid(pid, RIC_FLUSH_TLB); + if (type == FLUSH_TYPE_GLOBAL) + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + else + flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; + /* + * full pid flush already does the PWC flush. if it is not full pid + * flush check the range is more than PMD and force a pwc flush + * mremap() depends on this behaviour. + */ + if (!flush_pid && (end - start) >= PMD_SIZE) + flush_pwc = true; + + if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) { + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M); + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + if (flush_pwc) + type |= H_RPTI_TYPE_PWC; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (flush_pid) { + /* + * We are now flushing a range larger than PMD size force a RIC_FLUSH_ALL + */ + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_ALL); } else { if (cputlb_use_tlbie()) { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + _tlbie_pid(pid, RIC_FLUSH_ALL); } else { - _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } } } else { - bool hflush = false; + bool hflush; unsigned long hstart, hend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - else - hflush = true; - } + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend; - if (local) { + if (type == FLUSH_TYPE_LOCAL) { asm volatile("ptesync": : :"memory"); + if (flush_pwc) + /* For PWC, only one flush is needed */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else if (cputlb_use_tlbie()) { asm volatile("ptesync": : :"memory"); + if (flush_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, @@ -901,13 +1103,15 @@ is_local: asm volatile("eieio; tlbsync; ptesync": : :"memory"); } else { _tlbiel_va_range_multicast(mm, - start, end, pid, page_size, mmu_virtual_psize, false); + start, end, pid, page_size, mmu_virtual_psize, flush_pwc); if (hflush) _tlbiel_va_range_multicast(mm, - hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc); } } +out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -977,9 +1181,6 @@ void radix__flush_all_lpid_guest(unsigned int lpid) _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize); - void radix__tlb_flush(struct mmu_gather *tlb) { int psize = 0; @@ -995,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm || tlb->need_flush_all) { - __flush_all_mm(mm, true); + if (tlb->fullmm) { + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * Shootdown based lazy tlb mm refcounting means we + * have to IPI everyone in the mm_cpumask anyway soon + * when the mm goes away, so might as well do it as + * part of the final flush now. + * + * If lazy shootdown was improved to reduce IPIs (e.g., + * by batching), then it may end up being better to use + * tlbies here instead. + */ + preempt_disable(); + + smp_mb(); /* see radix__flush_tlb_mm */ + exit_flush_lazy_tlbs(mm); + __flush_all_mm(mm, true); + + preempt_enable(); + } else { + __flush_all_mm(mm, true); + } + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) radix__flush_tlb_mm(mm); @@ -1010,7 +1232,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) } } -static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { @@ -1018,33 +1240,38 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; + bool flush_pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_NONE) + goto out; - if (full) { - if (local) { + if (type == FLUSH_TYPE_GLOBAL) + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + else + flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; + + if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long pg_sizes = psize_to_rpti_pgsize(psize); + + if (also_pwc) + type |= H_RPTI_TYPE_PWC; + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (flush_pid) { + if (type == FLUSH_TYPE_LOCAL) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { if (cputlb_use_tlbie()) { @@ -1060,7 +1287,7 @@ is_local: } } else { - if (local) + if (type == FLUSH_TYPE_LOCAL) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); @@ -1068,7 +1295,9 @@ is_local: _tlbiel_va_range_multicast(mm, start, end, pid, page_size, psize, also_pwc); } +out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, @@ -1077,8 +1306,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, return __radix__flush_tlb_range_psize(mm, start, end, psize, false); } -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) +void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) { __radix__flush_tlb_range_psize(mm, start, end, psize, true); } @@ -1087,9 +1316,10 @@ static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { unsigned long pid, end; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; /* 4k page size, just blow the world */ @@ -1103,19 +1333,27 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) /* Otherwise first do the PWC, then iterate the pages. */ preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, type, pg_sizes; + + tgt = H_RPTI_TARGET_CMMU; + type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, + addr, end); + } else if (cputlb_use_tlbie()) _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); else _tlbiel_va_range_multicast(mm, addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } else { -local: - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } preempt_enable(); @@ -1129,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; @@ -1154,44 +1399,189 @@ void radix__flush_tlb_all(void) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) +static __always_inline void __tlbie_pid_lpid(unsigned long pid, + unsigned long lpid, + unsigned long ric) { - unsigned long pid = mm->context.id; + unsigned long rb, rs, prs, r; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; + rb = PPC_BIT(53); /* IS = 1 */ + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, + unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb, rs, prs, r; + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) +{ /* - * If this context hasn't run on that CPU before and KVM is - * around, there's a slim chance that the guest on another - * CPU just brought in obsolete translation into the TLB of - * this CPU due to a bad prefetch using the guest PID on - * the way into the hypervisor. - * - * We work around this here. If KVM is possible, we check if - * any sibling thread is in KVM. If it is, the window may exist - * and thus we flush that PID from the core. - * - * A potential future improvement would be to mark which PIDs - * have never been used on the system and avoid it if the PID - * is new and the process has no other cpumask bit set. + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. */ - if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { - int cpu = smp_processor_id(); - int sib = cpu_first_thread_sibling(cpu); - bool flush = false; - - for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { - if (sib == cpu) - continue; - if (!cpu_possible(sib)) - continue; - if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) - flush = true; + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), + RIC_FLUSH_TLB); + } +} + +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, + unsigned long ric) +{ + asm volatile("ptesync" : : : "memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + fixup_tlbie_pid_lpid(pid, lpid); + break; + case RIC_FLUSH_PWC: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + fixup_tlbie_pid_lpid(pid, lpid); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +static inline void fixup_tlbie_va_range_lpid(unsigned long va, + unsigned long pid, + unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); + } +} + +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); +} + +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync" : : : "memory"); + if (also_pwc) + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +/* + * Performs process-scoped invalidations for a given LPID + * as part of H_RPT_INVALIDATE hcall. + */ +void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, + unsigned long type, unsigned long pg_sizes, + unsigned long start, unsigned long end) +{ + unsigned long psize, nr_pages; + struct mmu_psize_def *def; + bool flush_pid; + + /* + * A H_RPTI_TYPE_ALL request implies RIC=3, hence + * do a single IS=1 based flush. + */ + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + return; + } + + if (type & H_RPTI_TYPE_PWC) + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + + /* Full PID flush */ + if (start == 0 && end == -1) + return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + + /* Do range invalidation for all the valid page sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + def = &mmu_psize_defs[psize]; + if (!(pg_sizes & def->h_rpt_pgsize)) + continue; + + nr_pages = (end - start) >> def->shift; + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + + /* + * If the number of pages spanning the range is above + * the ceiling, convert the request into a full PID flush. + * And since PID flush takes out all the page sizes, there + * is no need to consider remaining page sizes. + */ + if (flush_pid) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + return; } - if (flush) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbie_va_range_lpid(start, end, pid, lpid, + (1UL << def->shift), psize, false); } } -EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 716204aee3da..f2708c8629a5 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -9,11 +9,11 @@ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM */ -#include <asm/asm-prototypes.h> -#include <asm/pgtable.h> +#include <asm/interrupt.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> +#include <asm/lppaca.h> #include <asm/ppc-opcode.h> #include <asm/cputable.h> #include <asm/cacheflush.h> @@ -21,38 +21,26 @@ #include <linux/compiler.h> #include <linux/context_tracking.h> #include <linux/mm_types.h> +#include <linux/pgtable.h> #include <asm/udbg.h> #include <asm/code-patching.h> -enum slb_index { - LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - KSTACK_INDEX = 1, /* Kernel stack map */ -}; +#include "internal.h" -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} +bool stress_slb_enabled __initdata; -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, - unsigned long flags) +static int __init parse_stress_slb(char *p) { - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); + stress_slb_enabled = true; + return 0; } +early_param("stress_slb", parse_stress_slb); -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, - unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key); static void assert_slb_presence(bool present, unsigned long ea) { @@ -68,7 +56,7 @@ static void assert_slb_presence(bool present, unsigned long ea) * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware * ignores all other bits from 0-27, so just clear them all. */ - ea &= ~((1UL << 28) - 1); + ea &= ~((1UL << SID_SHIFT) - 1); asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); WARN_ON(present == (tmp == 0)); @@ -153,14 +141,42 @@ void slb_flush_all_realmode(void) asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } +static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside) +{ + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data, ksp_vsid_data; + u32 ih; + + /* + * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside + * information created with Class=0 entries, which we use for kernel + * SLB entries (the SLB entries themselves are still invalidated). + * + * Older processors will ignore this optimisation. Over-invalidation + * is fine because we never rely on lookaside information existing. + */ + if (preserve_kernel_lookaside) + ih = 1; + else + ih = 0; + + ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile(PPC_SLBIA(%0)" \n" + "slbmte %1, %2 \n" + :: "i" (ih), + "r" (ksp_vsid_data), + "r" (ksp_esid_data) + : "memory"); +} + /* * This flushes non-bolted entries, it can be run in virtual mode. Must * be called with interrupts disabled. */ void slb_flush_and_restore_bolted(void) { - struct slb_shadow *p = get_slb_shadow(); - BUILD_BUG_ON(SLB_NUM_BOLTED != 2); WARN_ON(!irqs_disabled()); @@ -171,13 +187,10 @@ void slb_flush_and_restore_bolted(void) */ hard_irq_disable(); - asm volatile("isync\n" - "slbia\n" - "slbmte %0, %1\n" - "isync\n" - :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), - "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) - : "memory"); + isync(); + __slb_flush_and_restore_bolted(false); + isync(); + assert_slb_presence(true, get_paca()->kstack); get_paca()->slb_cache_ptr = 0; @@ -216,7 +229,6 @@ void slb_dump_contents(struct slb_entry *slb_ptr) return; pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); for (i = 0; i < mmu_slb_size; i++) { e = slb_ptr->esid; @@ -226,34 +238,38 @@ void slb_dump_contents(struct slb_entry *slb_ptr) if (!e && !v) continue; - pr_err("%02d %016lx %016lx\n", i, e, v); + pr_err("%02d %016lx %016lx %s\n", i, e, v, + (e & SLB_ESID_V) ? "VALID" : "NOT VALID"); - if (!(e & SLB_ESID_V)) { - pr_err("\n"); + if (!(e & SLB_ESID_V)) continue; - } + llp = v & SLB_VSID_LLP; if (v & SLB_VSID_B_1T) { - pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", + pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", GET_ESID_1T(e), (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); } else { - pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", + pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", GET_ESID(e), (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); } } - pr_err("----------------------------------\n"); - - /* Dump slb cache entires as well. */ - pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); - pr_err("Valid SLB cache entries:\n"); - n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); - for (i = 0; i < n; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); - pr_err("Rest of SLB cache entries:\n"); - for (i = n; i < SLB_CACHE_ENTRIES; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + /* RR is not so useful as it's often not used for allocation */ + pr_err("SLB RR allocator index %d\n", get_paca()->stab_rr); + + /* Dump slb cache entires as well. */ + pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); + pr_err("Valid SLB cache entries:\n"); + n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); + for (i = 0; i < n; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + pr_err("Rest of SLB cache entries:\n"); + for (i = n; i < SLB_CACHE_ENTRIES; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + } } void slb_vmalloc_update(void) @@ -332,7 +348,7 @@ void slb_setup_new_exec(void) /* * We have no good place to clear the slb preload cache on exec, * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. + * after we switch to the mm and have already preloaded the SLBEs. * * For the most part that's probably okay to use entries from the * previous exec, they will age out if unused. It may turn out to @@ -400,6 +416,30 @@ void preload_new_slb_context(unsigned long start, unsigned long sp) local_irq_enable(); } +static void slb_cache_slbie_kernel(unsigned int index) +{ + unsigned long slbie_data = get_paca()->slb_cache[index]; + unsigned long ksp = get_paca()->kstack; + + slbie_data <<= SID_SHIFT; + slbie_data |= 0xc000000000000000ULL; + if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data) + return; + slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT; + + asm volatile("slbie %0" : : "r" (slbie_data)); +} + +static void slb_cache_slbie_user(unsigned int index) +{ + unsigned long slbie_data = get_paca()->slb_cache[index]; + + slbie_data <<= SID_SHIFT; + slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + + asm volatile("slbie %0" : : "r" (slbie_data)); +} /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) @@ -414,8 +454,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); - asm volatile("isync" : : : "memory"); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { + isync(); + if (stress_slb()) { + __slb_flush_and_restore_bolted(false); + isync(); + get_paca()->slb_cache_ptr = 0; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + + } else if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* * SLBIA IH=3 invalidates all Class=1 SLBEs and their * associated lookaside structures, which matches what @@ -423,47 +469,29 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * cache. */ asm volatile(PPC_SLBIA(3)); + } else { unsigned long offset = get_paca()->slb_cache_ptr; if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { - unsigned long slbie_data = 0; - - for (i = 0; i < offset; i++) { - unsigned long ea; - - ea = (unsigned long) - get_paca()->slb_cache[i] << SID_SHIFT; - /* - * Could assert_slb_presence(true) here, but - * hypervisor or machine check could have come - * in and removed the entry at this point. - */ - - slbie_data = ea; - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* user slbs have C=1 */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } + /* + * Could assert_slb_presence(true) here, but + * hypervisor or machine check could have come + * in and removed the entry at this point. + */ + + for (i = 0; i < offset; i++) + slb_cache_slbie_user(i); /* Workaround POWER5 < DD2.1 issue */ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) - asm volatile("slbie %0" : : "r" (slbie_data)); + slb_cache_slbie_user(0); } else { - struct slb_shadow *p = get_slb_shadow(); - unsigned long ksp_esid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].esid); - unsigned long ksp_vsid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - - asm volatile(PPC_SLBIA(1) "\n" - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data)); + /* Flush but retain kernel lookaside information */ + __slb_flush_and_restore_bolted(true); + isync(); get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; } @@ -503,7 +531,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * address accesses by the kernel (user mode won't happen until * rfid, which is safe). */ - asm volatile("isync" : : : "memory"); + isync(); } void slb_set_size(u16 size) @@ -571,6 +599,9 @@ static void slb_cache_update(unsigned long esid_data) if (cpu_has_feature(CPU_FTR_ARCH_300)) return; /* ISAv3.0B and later does not use slb_cache */ + if (stress_slb()) + return; + /* * Now update slb cache entries */ @@ -580,12 +611,12 @@ static void slb_cache_update(unsigned long esid_data) * We have space in slb cache for optimized switch_slb(). * Top 36 bits from esid_data as per ISA */ - local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT; local_paca->slb_cache_ptr++; } else { /* * Our cache is full and the current cache content strictly - * doesn't indicate the active SLB conents. Bump the ptr + * doesn't indicate the active SLB contents. Bump the ptr * so that switch_slb() will ignore the cache. */ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; @@ -671,6 +702,28 @@ static long slb_insert_entry(unsigned long ea, unsigned long context, * accesses user memory before it returns to userspace with rfid. */ assert_slb_presence(false, ea); + if (stress_slb()) { + int slb_cache_index = local_paca->slb_cache_ptr; + + /* + * stress_slb() does not use slb cache, repurpose as a + * cache of inserted (non-bolted) kernel SLB entries. All + * non-bolted kernel entries are flushed on any user fault, + * or if there are already 3 non-boled kernel entries. + */ + BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3); + if (!kernel || slb_cache_index == 3) { + int i; + + for (i = 0; i < slb_cache_index; i++) + slb_cache_slbie_kernel(i); + slb_cache_index = 0; + } + + if (kernel) + local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT; + local_paca->slb_cache_ptr = slb_cache_index; + } asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); barrier(); @@ -689,8 +742,8 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (id == LINEAR_MAP_REGION_ID) { - /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) + /* We only support upto H_MAX_PHYSMEM_BITS */ + if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; @@ -761,30 +814,33 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) return slb_insert_entry(ea, context, flags, ssize, false); } -long do_slb_fault(struct pt_regs *regs, unsigned long ea) +DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) { + unsigned long ea = regs->dar; unsigned long id = get_region_id(ea); /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* - * SLB kernel faults must be very careful not to touch anything - * that is not bolted. E.g., PACA and global variables are okay, - * mm->context stuff is not. - * - * SLB user faults can access all of kernel memory, but must be - * careful not to touch things like IRQ state because it is not - * "reconciled" here. The difficulty is that we must use - * fast_exception_return to return from kernel SLB faults without - * looking at possible non-bolted memory. We could test user vs - * kernel faults in the interrupt handler asm and do a full fault, - * reconcile, ret_from_except for user faults which would make them - * first class kernel code. But for performance it's probably nicer - * if they go via fast_exception_return too. + * SLB kernel faults must be very careful not to touch anything that is + * not bolted. E.g., PACA and global variables are okay, mm->context + * stuff is not. SLB user faults may access all of memory (and induce + * one recursive SLB kernel fault), so the kernel fault must not + * trample on the user fault state at those points. + */ + + /* + * This is a raw interrupt handler, for performance, so that + * fast_interrupt_return can be used. The handler must not touch local + * irq state, or schedule. We could test for usermode and upgrade to a + * normal process context (synchronous) interrupt for those, which + * would make them first-class kernel code and able to be traced and + * instrumented, although performance would suffer a bit, it would + * probably be a good tradeoff. */ if (id >= LINEAR_MAP_REGION_ID) { long err; @@ -812,17 +868,3 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) return err; } } - -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) -{ - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c new file mode 100644 index 000000000000..c0b58afb9a47 --- /dev/null +++ b/arch/powerpc/mm/book3s64/slice.c @@ -0,0 +1,807 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * address space "slices" (meta-segments) support + * + * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. + * + * Based on hugetlb implementation + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/hugetlb.h> +#include <linux/sched/mm.h> +#include <linux/security.h> +#include <asm/mman.h> +#include <asm/mmu.h> +#include <asm/copro.h> +#include <asm/hugetlb.h> +#include <asm/mmu_context.h> + +static DEFINE_SPINLOCK(slice_convert_lock); + +#ifdef DEBUG +int _slice_debug = 1; + +static void slice_print_mask(const char *label, const struct slice_mask *mask) +{ + if (!_slice_debug) + return; + pr_devel("%s low_slice: %*pbl\n", label, + (int)SLICE_NUM_LOW, &mask->low_slices); + pr_devel("%s high_slice: %*pbl\n", label, + (int)SLICE_NUM_HIGH, mask->high_slices); +} + +#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) + +#else + +static void slice_print_mask(const char *label, const struct slice_mask *mask) {} +#define slice_dbg(fmt...) + +#endif + +static inline notrace bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) +{ + unsigned long end = start + len - 1; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + + bitmap_set(ret->high_slices, start_index, count); + } +} + +static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) + return 0; + vma = find_vma(mm, addr); + return (!vma || (addr + len) <= vm_start_gap(vma)); +} + +static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) +{ + return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, + 1ul << SLICE_LOW_SHIFT); +} + +static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) +{ + unsigned long start = slice << SLICE_HIGH_SHIFT; + unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); + + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = (unsigned long)SLICE_LOW_TOP; + + return !slice_area_is_free(mm, start, end - start); +} + +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, + unsigned long high_limit) +{ + unsigned long i; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + for (i = 0; i < SLICE_NUM_LOW; i++) + if (!slice_low_has_vma(mm, i)) + ret->low_slices |= 1u << i; + + if (slice_addr_is_low(high_limit - 1)) + return; + + for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) + if (!slice_high_has_vma(mm, i)) + __set_bit(i, ret->high_slices); +} + +static bool slice_check_range_fits(struct mm_struct *mm, + const struct slice_mask *available, + unsigned long start, unsigned long len) +{ + unsigned long end = start + len - 1; + u64 low_slices = 0; + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + if ((low_slices & available->low_slices) != low_slices) + return false; + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + unsigned long i; + + for (i = start_index; i < start_index + count; i++) { + if (!test_bit(i, available->high_slices)) + return false; + } + } + + return true; +} + +static void slice_flush_segments(void *parm) +{ +#ifdef CONFIG_PPC64 + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + copy_mm_to_paca(current->active_mm); + + local_irq_save(flags); + slb_flush_and_restore_bolted(); + local_irq_restore(flags); +#endif +} + +static void slice_convert(struct mm_struct *mm, + const struct slice_mask *mask, int psize) +{ + int index, mask_index; + /* Write the new slice psize bits */ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *psize_mask, *old_mask; + unsigned long i, flags; + int old_psize; + + slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); + slice_print_mask(" mask", mask); + + psize_mask = slice_mask_for_size(&mm->context, psize); + + /* We need to use a spinlock here to protect against + * concurrent 64k -> 4k demotion ... + */ + spin_lock_irqsave(&slice_convert_lock, flags); + + lpsizes = mm_ctx_low_slices(&mm->context); + for (i = 0; i < SLICE_NUM_LOW; i++) { + if (!(mask->low_slices & (1u << i))) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + old_mask->low_slices &= ~(1u << i); + psize_mask->low_slices |= 1u << i; + + /* Update the sizes array */ + lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { + if (!test_bit(i, mask->high_slices)) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + __clear_bit(i, old_mask->high_slices); + __set_bit(i, psize_mask->high_slices); + + /* Update the sizes array */ + hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + slice_dbg(" lsps=%lx, hsps=%lx\n", + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); + + spin_unlock_irqrestore(&slice_convert_lock, flags); + + copro_flush_all_slbs(mm); +} + +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + const struct slice_mask *available, + int end, unsigned long *boundary_addr) +{ + unsigned long slice; + if (slice_addr_is_low(addr)) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available->low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? + ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!test_bit(slice, available->high_slices); + } +} + +static unsigned long slice_find_area_bottomup(struct mm_struct *mm, + unsigned long addr, unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long found, next_end; + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + /* + * Check till the allow max value for this mmap request + */ + while (addr < high_limit) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) + continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. + */ + if (addr >= high_limit) + addr = high_limit; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; + } + info.high_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + return -ENOMEM; +} + +static unsigned long slice_find_area_topdown(struct mm_struct *mm, + unsigned long addr, unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long found, prev; + struct vm_unmapped_area_info info; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + /* + * If we are trying to allocate above DEFAULT_MAP_WINDOW + * Add the different to the mmap_base. + * Only for that request for which high_limit is above + * DEFAULT_MAP_WINDOW we should apply this. + */ + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; + + while (addr > min_addr) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) + continue; + + prev_slice: + /* + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. + */ + if (addr < min_addr) + addr = min_addr; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; + } + info.low_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit); +} + + +static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, + const struct slice_mask *mask, int psize, + int topdown, unsigned long high_limit) +{ + if (topdown) + return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit); + else + return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit); +} + +static inline void slice_copy_mask(struct slice_mask *dst, + const struct slice_mask *src) +{ + dst->low_slices = src->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_or_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices | src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_andnot_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices & ~src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown) +{ + struct slice_mask good_mask; + struct slice_mask potential_mask; + const struct slice_mask *maskp; + const struct slice_mask *compat_maskp = NULL; + int fixed = (flags & MAP_FIXED); + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long page_size = 1UL << pshift; + struct mm_struct *mm = current->mm; + unsigned long newaddr; + unsigned long high_limit; + + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (len & (page_size - 1)) + return -EINVAL; + if (fixed) { + if (addr & (page_size - 1)) + return -EINVAL; + if (addr > high_limit - len) + return -ENOMEM; + } + + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { + /* + * Increasing the slb_addr_limit does not require + * slice mask cache to be recalculated because it should + * be already initialised beyond the old address limit. + */ + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); + + on_each_cpu(slice_flush_segments, mm, 1); + } + + /* Sanity checks */ + BUG_ON(mm->task_size == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); + VM_BUG_ON(radix_enabled()); + + slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); + + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = ALIGN(addr, page_size); + slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > high_limit - len || addr < mmap_min_addr || + !slice_area_is_free(mm, addr, len)) + addr = 0; + } + + /* First make up a "good" mask of slices that have the right size + * already + */ + maskp = slice_mask_for_size(&mm->context, psize); + + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ + + /* + * If we support combo pages, we can allow 64k pages in 4k slices + * The mask copies could be avoided in most cases here if we had + * a pointer to good mask for the next code to use. + */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + if (fixed) + slice_or_mask(&good_mask, maskp, compat_maskp); + else + slice_copy_mask(&good_mask, maskp); + } else { + slice_copy_mask(&good_mask, maskp); + } + + slice_print_mask(" good_mask", &good_mask); + if (compat_maskp) + slice_print_mask(" compat_mask", compat_maskp); + + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { + /* Check if we fit in the good mask. If we do, we just return, + * nothing else to do + */ + if (slice_check_range_fits(mm, &good_mask, addr, len)) { + slice_dbg(" fits good !\n"); + newaddr = addr; + goto return_addr; + } + } else { + /* Now let's see if we can find something in the existing + * slices for that size + */ + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + /* + * We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + slice_mask_for_free(mm, &potential_mask, high_limit); + slice_or_mask(&potential_mask, &potential_mask, &good_mask); + slice_print_mask(" potential", &potential_mask); + + if (addr != 0 || fixed) { + if (slice_check_range_fits(mm, &potential_mask, addr, len)) { + slice_dbg(" fits potential !\n"); + newaddr = addr; + goto convert; + } + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ + if (fixed) + return -EBUSY; + + slice_dbg(" search...\n"); + + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. + */ + if (addr) { + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + + /* Now let's see if we can find something in the existing slices + * for that size plus free slices + */ + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + slice_or_mask(&potential_mask, &potential_mask, compat_maskp); + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + } + + if (newaddr == -ENOMEM) + return -ENOMEM; + + slice_range_to_mask(newaddr, len, &potential_mask); + slice_dbg(" found potential area at 0x%lx\n", newaddr); + slice_print_mask(" mask", &potential_mask); + + convert: + /* + * Try to allocate the context before we do slice convert + * so that we handle the context allocation failure gracefully. + */ + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + + slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); + if (compat_maskp && !fixed) + slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); + if (potential_mask.low_slices || + (SLICE_NUM_HIGH && + !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { + slice_convert(mm, &potential_mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } + return newaddr; + +return_addr: + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + return newaddr; +} +EXPORT_SYMBOL_GPL(slice_get_unmapped_area); + +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + + return slice_get_unmapped_area(addr, len, flags, + mm_ctx_user_psize(¤t->mm->context), 0); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + + return slice_get_unmapped_area(addr0, len, flags, + mm_ctx_user_psize(¤t->mm->context), 1); +} + +unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + unsigned char *psizes; + int index, mask_index; + + VM_BUG_ON(radix_enabled()); + + if (slice_addr_is_low(addr)) { + psizes = mm_ctx_low_slices(&mm->context); + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = mm_ctx_high_slices(&mm->context); + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xf; +} +EXPORT_SYMBOL_GPL(get_slice_psize); + +void slice_init_new_context_exec(struct mm_struct *mm) +{ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *mask; + unsigned int psize = mmu_virtual_psize; + + slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm); + + /* + * In the case of exec, use the default limit. In the + * case of fork it is just inherited from the mm being + * duplicated. + */ + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); + mm_ctx_set_user_psize(&mm->context, psize); + + /* + * Set all slice psizes to the default. + */ + lpsizes = mm_ctx_low_slices(&mm->context); + memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); + + hpsizes = mm_ctx_high_slices(&mm->context); + memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); + + /* + * Slice mask cache starts zeroed, fill the default size cache. + */ + mask = slice_mask_for_size(&mm->context, psize); + mask->low_slices = ~0UL; + if (SLICE_NUM_HIGH) + bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); +} + +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask; + + VM_BUG_ON(radix_enabled()); + + slice_range_to_mask(start, len, &mask); + slice_convert(mm, &mask, psize); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * is_hugepage_only_range() is used by generic code to verify whether + * a normal mmap mapping (non hugetlbfs) is valid on a given area. + * + * until the generic code provides a more generic hook and/or starts + * calling arch get_unmapped_area for MAP_FIXED (which our implementation + * here knows how to deal with), we hijack it to keep standard mappings + * away from us. + * + * because of that generic code limitation, MAP_FIXED mapping cannot + * "convert" back a slice with no VMAs to the standard page size, only + * get_unmapped_area() can. It would be possible to fix it here but I + * prefer working on fixing the generic code instead. + * + * WARNING: This will not work if hugetlbfs isn't enabled since the + * generic code will redefine that function as 0 in that. This is ok + * for now as we only use slices with hugetlbfs enabled. This should + * be fixed as the generic code gets fixed. + */ +int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + const struct slice_mask *maskp; + unsigned int psize = mm_ctx_user_psize(&mm->context); + + VM_BUG_ON(radix_enabled()); + + maskp = slice_mask_for_size(&mm->context, psize); + + /* We need to account for 4k slices too */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + const struct slice_mask *compat_maskp; + struct slice_mask available; + + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + slice_or_mask(&available, maskp, compat_maskp); + return !slice_check_range_fits(mm, &available, addr, len); + } + + return !slice_check_range_fits(mm, maskp, addr, len); +} + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + /* With radix we don't use slice, so derive it from vma*/ + if (radix_enabled()) + return vma_kernel_pagesize(vma); + + return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); +} + +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + return shift_to_mmu_psize(huge_page_shift(hstate)); +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + if (radix_enabled()) + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + + return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); +} +#endif diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 2ef24a53f4c9..ec98e526167e 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -11,7 +11,7 @@ #include <linux/hugetlb.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <linux/uaccess.h> /* @@ -54,21 +54,25 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return; arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); @@ -92,7 +96,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) size_t nw; unsigned long next, limit; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); spt = mm_ctx_subpage_prot(&mm->context); if (!spt) @@ -127,7 +131,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) } err_out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -141,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops subpage_walk_ops = { .pmd_entry = subpage_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma_range(vmi, vma, addr + len) { + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else @@ -217,13 +213,13 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) return -EFAULT; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); spt = mm_ctx_subpage_prot(&mm->context); if (!spt) { /* * Allocate subpage prot table if not already done. - * Do this with mmap_sem held + * Do this with mmap_lock held */ spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); if (!spt) { @@ -267,11 +263,11 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, if (addr + (nw << PAGE_SHIFT) > next) nw = (next - addr) >> PAGE_SHIFT; - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (__copy_from_user(spp, map, nw * sizeof(u32))) return -EFAULT; map += nw; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); @@ -280,6 +276,6 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, spt->maxaddr = limit; err = 0; out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return err; } diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c new file mode 100644 index 000000000000..ccd64b5e6cac --- /dev/null +++ b/arch/powerpc/mm/book3s64/trace.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file is for defining trace points and trace related helpers. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include <trace/events/thp.h> +#endif |