diff options
Diffstat (limited to 'arch/powerpc/mm')
80 files changed, 4638 insertions, 4350 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index eae4ec2988fc..8c1582b2987d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -3,9 +3,7 @@ # Makefile for the linux ppc-specific parts of the memory manager. # -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - -obj-y := fault.o mem.o pgtable.o mmap.o maccess.o pageattr.o \ +obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o \ @@ -14,9 +12,8 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_PTDUMP) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index 15f4773643d2..50dd8f6bdf46 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -9,5 +9,4 @@ endif obj-y += mmu.o mmu_context.o obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o -obj-$(CONFIG_PPC_KUEP) += kuep.o obj-$(CONFIG_PPC_KUAP) += kuap.o diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 6925ce998557..4ed0efd03db5 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -14,6 +14,7 @@ * hash table, so this file is not used on them.) */ +#include <linux/export.h> #include <linux/pgtable.h> #include <linux/init.h> #include <asm/reg.h> @@ -22,7 +23,6 @@ #include <asm/ppc_asm.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> -#include <asm/export.h> #include <asm/feature-fixups.h> #include <asm/code-patching-asm.h> @@ -36,8 +36,9 @@ /* * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. + * The address is in r4, and r3 contains required access flags: + * - For ISI: _PAGE_PRESENT | _PAGE_EXEC + * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. * SPRG_THREAD contains the physical address of the current task's thread. * @@ -67,12 +68,16 @@ _GLOBAL(hash_page) lis r0, TASK_SIZE@h /* check if kernel address */ cmplw 0,r4,r0 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ + andi. r0,r9,MSR_PR /* Check usermode */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +#ifdef CONFIG_SMP + bne- .Lhash_page_out /* return if usermode */ +#else + bnelr- +#endif 112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ @@ -113,15 +118,15 @@ _GLOBAL(hash_page) lwarx r6,0,r8 /* get linux-style pte, flag word */ #ifdef CONFIG_PPC_KUAP mfsrin r5,r4 - rlwinm r0,r9,28,_PAGE_RW /* MSR[PR] => _PAGE_RW */ - rlwinm r5,r5,12,_PAGE_RW /* Ks => _PAGE_RW */ + rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */ + rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */ andc r5,r5,r0 /* Ks & ~MSR[PR] */ - andc r5,r6,r5 /* Clear _PAGE_RW when Ks = 1 && MSR[PR] = 0 */ + andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */ andc. r5,r3,r5 /* check access & ~permission */ #else andc. r5,r3,r6 /* check access & ~permission */ #endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE #ifdef CONFIG_SMP bne- .Lhash_page_out /* return if access not permitted */ @@ -199,12 +204,12 @@ _GLOBAL(add_hash_page) lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r6 beq+ 12f 11: lwz r0,0(r6) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync @@ -307,12 +312,15 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) __REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + lis r0, TASK_SIZE@h + rlwinm r5,r5,0,~3 /* Clear PP bits */ + cmplw r4,r0 + rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ + bge- 1f /* Kernelspace ? Skip */ + ori r5,r5,3 /* Userspace ? PP = 3 */ +1: ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ @@ -512,12 +520,12 @@ _GLOBAL(flush_hash_pages) lwz r8, TASK_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r9 beq+ 12f 11: lwz r0,0(r9) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c index 0f920f09af57..3a8815555a48 100644 --- a/arch/powerpc/mm/book3s32/kuap.c +++ b/arch/powerpc/mm/book3s32/kuap.c @@ -3,31 +3,20 @@ #include <asm/kup.h> #include <asm/smp.h> -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); - -void kuap_lock_all_ool(void) -{ - kuap_lock_all(); -} -EXPORT_SYMBOL(kuap_lock_all_ool); - -void kuap_unlock_all_ool(void) -{ - kuap_unlock_all(); -} -EXPORT_SYMBOL(kuap_unlock_all_ool); - void setup_kuap(bool disabled) { - if (!disabled) - kuap_lock_all_ool(); + if (!disabled) { + update_user_segments(mfsr(0) | SR_KS); + isync(); /* Context sync required after mtsr() */ + init_mm.context.sr0 |= SR_KS; + current->thread.sr0 |= SR_KS; + } if (smp_processor_id() != boot_cpuid) return; if (disabled) - static_branch_enable(&disable_kuap_key); + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; else pr_info("Activating Kernel Userspace Access Protection\n"); } diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c deleted file mode 100644 index c20733d6e02c..000000000000 --- a/arch/powerpc/mm/book3s32/kuep.c +++ /dev/null @@ -1,20 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <asm/kup.h> -#include <asm/smp.h> - -struct static_key_false disable_kuep_key; - -void setup_kuep(bool disabled) -{ - if (!disabled) - kuep_lock(); - - if (smp_processor_id() != boot_cpuid) - return; - - if (disabled) - static_branch_enable(&disable_kuep_key); - else - pr_info("Activating Kernel Userspace Execution Prevention\n"); -} diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 27061583a010..c42ecdf94e48 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -23,10 +23,9 @@ #include <linux/highmem.h> #include <linux/memblock.h> -#include <asm/prom.h> #include <asm/mmu.h> #include <asm/machdep.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> #include <mm/mmu_decl.h> @@ -76,7 +75,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static int find_free_bat(void) +int __init find_free_bat(void) { int b; int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; @@ -100,7 +99,7 @@ static int find_free_bat(void) * - block size has to be a power of two. This is calculated by finding the * highest bit set to 1. */ -static unsigned int block_size(unsigned long base, unsigned long top) +unsigned int bat_block_size(unsigned long base, unsigned long top) { unsigned int max_size = SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; @@ -128,7 +127,7 @@ static void setibat(int index, unsigned long virt, phys_addr_t phys, wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[0].batu |= 1; /* Vp = 1 */ } @@ -145,7 +144,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to int idx; while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); + unsigned int size = bat_block_size(base, top); if (size < 128 << 10) break; @@ -159,10 +158,13 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET; + unsigned long size; + size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET); + setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X); - if (debug_pagealloc_enabled_or_kfence() || __map_without_bats) { + if (debug_pagealloc_enabled_or_kfence()) { pr_debug_once("Read-Write memory mapped without BATs\n"); if (base >= border) return base; @@ -182,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) static bool is_module_segment(unsigned long addr) { - if (!IS_ENABLED(CONFIG_MODULES)) + if (!IS_ENABLED(CONFIG_EXECMEM)) return false; if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; @@ -191,30 +193,29 @@ static bool is_module_segment(unsigned long addr) return true; } -void mmu_mark_initmem_nx(void) +int mmu_mark_initmem_nx(void) { int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; unsigned long base = (unsigned long)_stext - PAGE_OFFSET; - unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K); unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; unsigned long size; - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { - size = block_size(base, top); - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + for (i = 0; i < nb - 1 && base < top;) { + size = bat_block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } if (base < top) { - size = block_size(base, top); - size = max(size, 128UL << 10); + size = bat_block_size(base, top); if ((top - base) > size) { size <<= 1; if (strict_kernel_rwx_enabled() && base + size > border) pr_warn("Some RW data is getting mapped X. " "Adjust CONFIG_DATA_SHIFT to avoid that.\n"); } - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } for (; i < nb; i++) @@ -222,6 +223,8 @@ void mmu_mark_initmem_nx(void) update_bats(); + BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, SZ_256M) < TASK_SIZE); + for (i = TASK_SIZE >> 28; i < 16; i++) { /* Do not set NX on VM space for modules */ if (is_module_segment(i << 28)) @@ -229,9 +232,10 @@ void mmu_mark_initmem_nx(void) mtsr(mfsr(i << 28) | 0x10000000, i << 28); } + return 0; } -void mmu_mark_rodata_ro(void) +int mmu_mark_rodata_ro(void) { int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; @@ -239,18 +243,19 @@ void mmu_mark_rodata_ro(void) for (i = 0; i < nb; i++) { struct ppc_bat *bat = BATS[i]; - if (bat_addrs[i].start < (unsigned long)__init_begin) + if (bat_addrs[i].start < (unsigned long)__end_rodata) bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; } update_bats(); + + return 0; } /* - * Set up one of the I/D BAT (block address translation) register pairs. + * Set up one of the D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. - * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -277,19 +282,15 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + wimgxpp |= (flags & _PAGE_WRITE) ? BPP_RW : BPP_RX; bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ flags &= ~_PAGE_EXEC; } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; bat_addrs[index].start = virt; bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; @@ -318,11 +319,9 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held @@ -378,10 +377,7 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); + Hash = memblock_alloc_or_panic(Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; pr_info("Total memory = %lldMB; using %ldkB for hash table\n", diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index e2708e387dc3..1922f9a6b058 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -69,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context); int init_new_context(struct task_struct *t, struct mm_struct *mm) { mm->context.id = __init_new_context(); + mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0); + + if (IS_ENABLED(CONFIG_PPC_KUEP)) + mm->context.sr0 |= SR_NX; + if (!kuap_is_disabled()) + mm->context.sr0 |= SR_KS; return 0; } @@ -108,20 +114,13 @@ void __init mmu_context_init(void) void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { long id = next->context.id; - unsigned long val; if (id < 0) panic("mm_struct %p has no context ID", next); isync(); - val = CTX_TO_VSID(id, 0); - if (!kuep_is_disabled()) - val |= SR_NX; - if (!kuap_is_disabled()) - val |= SR_KS; - - update_user_segments(val); + update_user_segments(next->context.sr0); if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d77..e54a7b011232 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); @@ -104,3 +105,12 @@ void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); } EXPORT_SYMBOL(hash__flush_tlb_page); + +void hash__flush_gather(struct mmu_gather *tlb) +{ + if (tlb->fullmm || tlb->need_flush_all) + hash__flush_tlb_mm(tlb->mm); + else + hash__flush_range(tlb->mm, tlb->start, tlb->end); +} +EXPORT_SYMBOL(hash__flush_gather); diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 1b56d3af47d4..33af5795856a 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -1,23 +1,33 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := $(NO_MINIMAL_TOC) - +obj-y += mmu_context.o pgtable.o trace.o +ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - -obj-y += hash_pgtable.o hash_utils.o slb.o \ - mmu_context.o pgtable.o hash_tlb.o -obj-$(CONFIG_PPC_NATIVE) += hash_native.o -obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o +obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +endif + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o obj-$(CONFIG_PPC_PKEY) += pkeys.o # Instrumenting the SLB fault path can lead to duplicate SLB entries KCOV_INSTRUMENT_slb.o := n + +# Parts of these can run in real mode and therefore are +# not safe with the current outline KASAN implementation +KASAN_SANITIZE_mmu_context.o := n +KASAN_SANITIZE_pgtable.o := n +KASAN_SANITIZE_radix_pgtable.o := n +KASAN_SANITIZE_radix_tlb.o := n +KASAN_SANITIZE_slb.o := n +KASAN_SANITIZE_pkeys.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c index 7de1a8a0c62a..02acbfd05b46 100644 --- a/arch/powerpc/mm/book3s64/hash_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, int subpg_prot) @@ -118,6 +120,9 @@ repeat: } new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c index 998c6817ed47..954af420f358 100644 --- a/arch/powerpc/mm/book3s64/hash_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + /* * Return true, if the entry has a slot value which * the software considers as invalid. @@ -216,6 +218,9 @@ repeat: new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); new_pte |= H_PAGE_HASHPTE; + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -327,7 +332,12 @@ repeat: new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c index c0fabe6c5a12..cdfd4fe75edb 100644 --- a/arch/powerpc/mm/book3s64/hash_hugepage.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -54,21 +54,18 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, /* * Make sure this is thp or devmap entry */ - if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + if (!(old_pmd & H_PAGE_THP_HUGE)) return 0; rflags = htab_convert_pte_flags(new_pmd, flags); -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + /* + * THPs are only supported on platforms that can do mixed page size + * segments (MPSS) and all such platforms have coherent icache. Hence we + * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on + * noexecute fault. + */ - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif /* * Find the slot index details for this ea, using base page size. */ diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 52e170bd95ae..e9e2dd70c060 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -27,8 +27,6 @@ #include <asm/ppc-opcode.h> #include <asm/feature-fixups.h> -#include <misc/cxl-base.h> - #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) #else @@ -43,109 +41,28 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} - -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static void acquire_hpte_lock(void) { - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) - : "memory"); + lock_map_acquire(&hpte_lock_map); } - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +static void release_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - ppc_after_tlbiel_barrier(); + lock_map_release(&hpte_lock_map); } - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +#else +static void acquire_hpte_lock(void) { - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the partition table cache if this is HV mode. - */ - if (early_cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - - /* - * Now invalidate the process table cache. UPRT=0 HPT modes (what - * current hardware implements) do not use the process table, but - * add the flushes anyway. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - /* - * Then flush the sets of the TLB proper. Hash mode uses - * partition scoped TLB translations, which may be flushed - * in !HV mode. - */ - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - ppc_after_tlbiel_barrier(); - - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } -void hash__tlbiel_all(unsigned int action) +static void release_hpte_lock(void) { - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); } +#endif static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) @@ -267,7 +184,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) va |= ssize << 8; sllp = get_sllp_encoding(apsize); va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -286,7 +203,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) */ va |= (vpn & 0xfe); va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -298,11 +215,9 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) static inline void tlbie(unsigned long vpn, int psize, int apsize, int ssize, int local) { - unsigned int use_local; + unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); - if (use_local) use_local = mmu_psize_defs[psize].tlbiel; if (lock_tlbie && !use_local) @@ -324,6 +239,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -338,6 +254,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -347,8 +264,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -367,8 +287,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -390,19 +312,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } static long native_hpte_remove(unsigned long hpte_group) { + unsigned long hpte_v, flags; struct hash_pte *hptep; int i; int slot_offset; - unsigned long hpte_v; + + local_irq_save(flags); DBG_LOW(" remove(group=%lx)\n", hpte_group); @@ -427,12 +354,16 @@ static long native_hpte_remove(unsigned long hpte_group) slot_offset &= 0x7; } - if (i == HPTES_PER_GROUP) - return -1; + if (i == HPTES_PER_GROUP) { + i = -1; + goto out; + } /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - +out: + local_irq_restore(flags); return i; } @@ -443,6 +374,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -486,6 +420,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -549,6 +485,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -567,6 +506,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -580,6 +521,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -597,6 +541,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -621,10 +568,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -684,10 +632,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -787,7 +733,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; @@ -839,10 +785,6 @@ static void native_flush_hash_range(unsigned long number, int local) unsigned long psize = batch->psize; int ssize = batch->ssize; int i; - unsigned int use_local; - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && - mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); local_irq_save(flags); @@ -869,13 +811,16 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } - if (use_local) { + if (mmu_has_feature(MMU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && local) { asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { vpn = batch->vpn[i]; diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index ad5eff097d31..82d31177630b 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -13,10 +13,10 @@ #include <asm/sections.h> #include <asm/mmu.h> #include <asm/tlb.h> +#include <asm/firmware.h> #include <mm/mmu_decl.h> -#define CREATE_TRACE_POINTS #include <trace/events/thp.h> #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) @@ -195,7 +195,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + WARN_ON(!hash__pmd_trans_huge(*pmdp)); assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif @@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; @@ -227,7 +227,6 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); pmd = *pmdp; pmd_clear(pmdp); @@ -256,7 +255,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * the __collapse_huge_page_copy can result in copying * the old content. */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -378,7 +377,7 @@ int hash__has_transparent_hugepage(void) if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) return 0; /* - * We need to make sure that we support 16MB hugepage in a segement + * We need to make sure that we support 16MB hugepage in a segment * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE * of 64K. */ @@ -404,7 +403,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +478,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +517,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); @@ -541,7 +542,7 @@ void hash__mark_rodata_ro(void) unsigned long start, end, pp; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index eb0bccaf221e..21fcad97ae80 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; @@ -239,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) local_irq_save(flags); arch_enter_lazy_mmu_mode(); start_pte = pte_offset_map(pmd, addr); + if (!start_pte) + goto out; for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } + pte_unmap(start_pte); +out: arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ac5720371c0d..9dc5889d6ecb 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -36,26 +36,30 @@ #include <linux/hugetlb.h> #include <linux/cpu.h> #include <linux/pgtable.h> +#include <linux/debugfs.h> +#include <linux/random.h> +#include <linux/elf-randomize.h> +#include <linux/of_fdt.h> +#include <linux/kfence.h> -#include <asm/debugfs.h> #include <asm/interrupt.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> +#include <asm/pgalloc.h> #include <asm/types.h> #include <linux/uaccess.h> #include <asm/machdep.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/eeh.h> #include <asm/tlb.h> #include <asm/cacheflush.h> #include <asm/cputable.h> #include <asm/sections.h> -#include <asm/copro.h> +#include <asm/spu.h> #include <asm/udbg.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/fadump.h> #include <asm/firmware.h> #include <asm/tm.h> @@ -64,6 +68,7 @@ #include <asm/pte-walk.h> #include <asm/asm-prototypes.h> #include <asm/ultravisor.h> +#include <asm/kfence.h> #include <mm/mmu_decl.h> @@ -99,8 +104,6 @@ */ static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); u8 hpte_page_sizes[1 << LP_BITS]; EXPORT_SYMBOL_GPL(hpte_page_sizes); @@ -114,9 +117,6 @@ EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; EXPORT_SYMBOL_GPL(mmu_kernel_ssize); @@ -126,12 +126,7 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ -struct mmu_hash_ops mmu_hash_ops; +struct mmu_hash_ops mmu_hash_ops __ro_after_init; EXPORT_SYMBOL(mmu_hash_ops); /* @@ -175,6 +170,376 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + ppc_after_tlbiel_barrier(); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the partition table cache if this is HV mode. + */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + + /* + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. + */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + ppc_after_tlbiel_barrier(); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + if (slots[idx] & 0x80) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + raw_spin_lock(lock); + BUG_ON(slots[idx] & 0x80); + slots[idx] = ret | 0x80; + raw_spin_unlock(lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash, hslot, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + raw_spin_lock(lock); + if (!(slots[idx] & 0x80)) { + raw_spin_unlock(lock); + return; + } + hslot = slots[idx] & 0x7f; + slots[idx] = 0; + raw_spin_unlock(lock); + if (hslot & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hslot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); +} +#endif + +static inline bool hash_supports_debug_pagealloc(void) +{ + unsigned long max_hash_count = ppc64_rma_size / 4; + unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + + if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count) + return false; + return true; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); +static __init void hash_debug_pagealloc_alloc_slots(void) +{ + if (!hash_supports_debug_pagealloc()) + return; + + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); +} + +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, + int slot) +{ + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return; + if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80; +} + +static int hash_debug_pagealloc_map_pages(struct page *page, int numpages, + int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return 0; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + } + local_irq_restore(flags); + return 0; +} + +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void hash_debug_pagealloc_alloc_slots(void) {} +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) {} +static int __maybe_unused +hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_KFENCE +static u8 *linear_map_kf_hash_slots; +static unsigned long linear_map_kf_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock); + +static phys_addr_t kfence_pool; + +static __init void hash_kfence_alloc_pool(void) +{ + if (!kfence_early_init_enabled()) + goto err; + + /* allocate linear map for kfence within RMA region */ + linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT; + linear_map_kf_hash_slots = memblock_alloc_try_nid( + linear_map_kf_hash_count, 1, + MEMBLOCK_LOW_LIMIT, ppc64_rma_size, + NUMA_NO_NODE); + if (!linear_map_kf_hash_slots) { + pr_err("%s: memblock for linear map (%lu) failed\n", __func__, + linear_map_kf_hash_count); + goto err; + } + + /* allocate kfence pool early */ + kfence_pool = memblock_phys_alloc_range(KFENCE_POOL_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ANYWHERE); + if (!kfence_pool) { + pr_err("%s: memblock for kfence pool (%lu) failed\n", __func__, + KFENCE_POOL_SIZE); + memblock_free(linear_map_kf_hash_slots, + linear_map_kf_hash_count); + linear_map_kf_hash_count = 0; + goto err; + } + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + + return; +err: + pr_info("Disabling kfence\n"); + disable_kfence(); +} + +static __init void hash_kfence_map_pool(void) +{ + unsigned long kfence_pool_start, kfence_pool_end; + unsigned long prot = pgprot_val(PAGE_KERNEL); + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + + if (!kfence_pool) + return; + + kfence_pool_start = (unsigned long) __va(kfence_pool); + kfence_pool_end = kfence_pool_start + KFENCE_POOL_SIZE; + __kfence_pool = (char *) kfence_pool_start; + BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end, + kfence_pool, prot, mmu_linear_psize, + mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, KFENCE_POOL_SIZE >> pshift); + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +} + +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) +{ + unsigned long vaddr = (unsigned long) __va(paddr); + unsigned long lmi = (vaddr - (unsigned long)__kfence_pool) + >> PAGE_SHIFT; + + if (!kfence_pool) + return; + BUG_ON(!is_kfence_address((void *)vaddr)); + BUG_ON(lmi >= linear_map_kf_hash_count); + linear_map_kf_hash_slots[lmi] = slot | 0x80; +} + +static int hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + WARN_ON_ONCE(!linear_map_kf_hash_count); + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = (vaddr - (unsigned long)__kfence_pool) >> PAGE_SHIFT; + + /* Ideally this should never happen */ + if (lmi >= linear_map_kf_hash_count) { + WARN_ON_ONCE(1); + continue; + } + + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + } + local_irq_restore(flags); + return 0; +} +#else +static inline void hash_kfence_alloc_pool(void) {} +static inline void hash_kfence_map_pool(void) {} +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) {} +static int __maybe_unused +hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +int hash__kernel_map_pages(struct page *page, int numpages, int enable) +{ + void *vaddr = page_address(page); + + if (is_kfence_address(vaddr)) + return hash_kfence_map_pages(page, numpages, enable); + else + return hash_debug_pagealloc_map_pages(page, numpages, enable); +} + +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) +{ + if (is_kfence_address(__va(paddr))) + hash_kfence_add_slot(paddr, slot); + else + hash_debug_pagealloc_add_slot(paddr, slot); +} +#else +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) {} +#endif + /* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* @@ -212,9 +577,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags else rflags |= 0x3; } + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); } else { if (pteflags & _PAGE_RWX) rflags |= 0x2; + /* + * We should never hit this in normal fault handling because + * a permission check (check_pte_access()) will bubble this + * to higher level linux handler even for PAGE_NONE. + */ + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } @@ -307,7 +679,7 @@ repeat: ssize); if (ret == -1) { /* - * Try to to keep bolted entries in primary. + * Try to keep bolted entries in primary. * Remove non bolted entries and try insert again */ ret = mmu_hash_ops.hpte_remove(hpteg); @@ -326,11 +698,8 @@ repeat: break; cond_resched(); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ + /* add slot info in debug_pagealloc / kfence linear map */ + hash_linear_map_add_slot(paddr, ret); } return ret < 0 ? ret : 0; } @@ -375,7 +744,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } -static bool disable_1tb_segments = false; +static bool disable_1tb_segments __ro_after_init; static int __init parse_disable_1tb_segments(char *p) { @@ -384,6 +753,40 @@ static int __init parse_disable_1tb_segments(char *p) } early_param("disable_1tb_segments", parse_disable_1tb_segments); +bool stress_hpt_enabled __initdata; + +static int __init parse_stress_hpt(char *p) +{ + stress_hpt_enabled = true; + return 0; +} +early_param("stress_hpt", parse_stress_hpt); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key); + +/* + * per-CPU array allocated if we enable stress_hpt. + */ +#define STRESS_MAX_GROUPS 16 +struct stress_hpt_struct { + unsigned long last_group[STRESS_MAX_GROUPS]; +}; + +static inline int stress_nr_groups(void) +{ + /* + * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries + * to allow practical forward progress. Bare metal returns 1, which + * seems to help uncover more bugs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return STRESS_MAX_GROUPS; + else + return 1; +} + +static struct stress_hpt_struct *stress_hpt_struct; + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -552,7 +955,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, block_size = be64_to_cpu(addr_prop[1]); if (block_size != (16 * GB)) return 0; - printk(KERN_INFO "Huge page(16GB) memory: " + pr_info("Huge page(16GB) memory: " "addr = 0x%lX size = 0x%lX pages = %d\n", phys_addr, block_size, expected_pages); if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { @@ -563,7 +966,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, } #endif /* CONFIG_HUGETLB_PAGE */ -static void mmu_psize_set_default_penc(void) +static void __init mmu_psize_set_default_penc(void) { int bpsize, apsize; for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) @@ -573,7 +976,7 @@ static void mmu_psize_set_default_penc(void) #ifdef CONFIG_PPC_64K_PAGES -static bool might_have_hea(void) +static bool __init might_have_hea(void) { /* * The HEA ethernet adapter requires awareness of the @@ -644,7 +1047,7 @@ static void __init htab_scan_page_sizes(void) * low-order N bits as the encoding for the 2^(12+N) byte page size * (if it exists). */ -static void init_hpte_page_sizes(void) +static void __init init_hpte_page_sizes(void) { long int ap, bp; long int shift, penc; @@ -677,7 +1080,7 @@ static void __init htab_init_page_sizes(void) bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled()) { + if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default @@ -735,7 +1138,7 @@ static void __init htab_init_page_sizes(void) mmu_vmemmap_psize = mmu_virtual_psize; #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - printk(KERN_DEBUG "Page orders: linear mapping = %d, " + pr_info("Page orders: linear mapping = %d, " "virtual = %d, io = %d" #ifdef CONFIG_SPARSEMEM_VMEMMAP ", vmemmap = %d" @@ -834,6 +1237,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid, pgprot_t prot) { int rc; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; if (end >= H_VMALLOC_START) { pr_warn("Outside the supported range\n"); @@ -851,17 +1255,22 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, mmu_kernel_ssize); BUG_ON(rc2 && (rc2 != -ENOENT)); } + update_page_count(mmu_linear_psize, (end - start) >> pshift); return rc; } int hash__remove_section_mapping(unsigned long start, unsigned long end) { + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + int rc = htab_remove_mapping(start, end, mmu_linear_psize, mmu_kernel_ssize); if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); + if (!rc) + update_page_count(mmu_linear_psize, -((end - start) >> pshift)); return rc; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -880,25 +1289,64 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, pr_info("Partition table %p\n", partition_tb); } +void hpt_clear_stress(void); +static struct timer_list stress_hpt_timer; +static void stress_hpt_timer_fn(struct timer_list *timer) +{ + int next_cpu; + + hpt_clear_stress(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + tlbiel_all(); + + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer_on(&stress_hpt_timer, next_cpu); +} + static void __init htab_initialize(void) { unsigned long table; unsigned long pteg_count; unsigned long prot; - phys_addr_t base = 0, size = 0, end; + phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE; u64 i; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; DBG(" -> htab_initialize()\n"); + if (firmware_has_feature(FW_FEATURE_LPAR)) + limit = ppc64_rma_size; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; - printk(KERN_INFO "Using 1TB segments\n"); + pr_info("Using 1TB segments\n"); } if (stress_slb_enabled) static_branch_enable(&stress_slb_key); + if (no_slb_preload) + static_branch_enable(&no_slb_preload_key); + + if (stress_hpt_enabled) { + unsigned long tmp; + static_branch_enable(&stress_hpt_key); + // Too early to use nr_cpu_ids, so use NR_CPUS + tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, + __alignof__(struct stress_hpt_struct), + MEMBLOCK_LOW_LIMIT, limit); + memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); + stress_hpt_struct = __va(tmp); + + timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&stress_hpt_timer); + } + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. @@ -924,23 +1372,10 @@ static void __init htab_initialize(void) mmu_hash_ops.hpte_clear_all(); #endif } else { - unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; - -#ifdef CONFIG_PPC_CELL - /* - * Cell may require the hash table down low when using the - * Axon IOMMU in order to fit the dynamic region over it, see - * comments in cell/iommu.c - */ - if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { - limit = 0x80000000; - pr_info("Hash table forced below 2G for Axon IOMMU\n"); - } -#endif /* CONFIG_PPC_CELL */ table = memblock_phys_alloc_range(htab_size_bytes, htab_size_bytes, - 0, limit); + MEMBLOCK_LOW_LIMIT, limit); if (!table) panic("ERROR: Failed to allocate %pa bytes below %pa\n", &htab_size_bytes, &limit); @@ -965,25 +1400,15 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = memblock_alloc_try_nid( - linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, - ppc64_rma_size, NUMA_NO_NODE); - if (!linear_map_hash_slots) - panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", - __func__, linear_map_hash_count, &ppc64_rma_size); - } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - + hash_debug_pagealloc_alloc_slots(); + hash_kfence_alloc_pool(); /* create bolted the linear mapping in the hash table */ for_each_mem_range(i, &base, &end) { size = end - base; base = (unsigned long)__va(base); - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); + pr_debug("creating mapping for region: 0x%pa..0x%pa (prot: %lx)\n", + &base, &size, prot); if ((base + size) >= H_VMALLOC_START) { pr_warn("Outside the supported range\n"); @@ -992,7 +1417,10 @@ static void __init htab_initialize(void) BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); + + update_page_count(mmu_linear_psize, size >> pshift); } + hash_kfence_map_pool(); memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); /* @@ -1012,6 +1440,8 @@ static void __init htab_initialize(void) BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, __pa(tce_alloc_start), prot, mmu_linear_psize, mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, + (tce_alloc_end - tce_alloc_start) >> pshift); } @@ -1066,10 +1496,6 @@ void __init hash__early_init_mmu(void) __pmd_table_size = H_PMD_TABLE_SIZE; __pud_table_size = H_PUD_TABLE_SIZE; __pgd_table_size = H_PGD_TABLE_SIZE; - /* - * 4k use hugepd format, so for hash set then to - * zero - */ __pmd_val_bits = HASH_PMD_VAL_BITS; __pud_val_bits = HASH_PUD_VAL_BITS; __pgd_val_bits = HASH_PGD_VAL_BITS; @@ -1091,7 +1517,7 @@ void __init hash__early_init_mmu(void) ps3_early_mm_init(); else if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE)) hpte_init_native(); if (!mmu_hash_ops.hpte_insert) @@ -1147,25 +1573,25 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f) && + !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); } else pp |= HPTE_R_N; } return pp; } -#ifdef CONFIG_PPC_MM_SLICES static unsigned int get_paca_psize(unsigned long addr) { unsigned char *psizes; @@ -1182,12 +1608,6 @@ static unsigned int get_paca_psize(unsigned long addr) return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; } -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif /* * Demote a segment to using 4k pages. @@ -1199,7 +1619,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) if (get_slice_psize(mm, addr) == MMU_PAGE_4K) return; slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); - copro_flush_all_slbs(mm); +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { copy_mm_to_paca(mm); @@ -1244,7 +1666,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) spp >>= 30 - 2 * ((ea >> 12) & 0xf); /* - * 0 -> full premission + * 0 -> full permission * 1 -> Read only * 2 -> no access. * We return the flag that need to be cleared. @@ -1385,6 +1807,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, goto bail; } + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) { + if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M) + hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift; + if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G) + hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift; + } + /* * Add _PAGE_PRESENT to the required access perm. If there are parallel * updates to the pte that can possibly clear _PAGE_PTE, catch that too. @@ -1457,11 +1886,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, * in vmalloc space, so switch vmalloc * to 4k pages */ - printk(KERN_ALERT "Reducing vmalloc segment " + pr_alert("Reducing vmalloc segment " "to 4kB pages because of " "non-cacheable mapping\n"); psize = mmu_vmalloc_psize = MMU_PAGE_4K; - copro_flush_all_slbs(mm); +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif } } @@ -1522,8 +1953,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -DECLARE_INTERRUPT_HANDLER(__do_hash_fault); -DEFINE_INTERRUPT_HANDLER(__do_hash_fault) +DEFINE_INTERRUPT_HANDLER(do_hash_fault) { unsigned long ea = regs->dar; unsigned long dsisr = regs->dsisr; @@ -1566,7 +1996,7 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault) err = hash_page_mm(mm, ea, access, TRAP(regs), flags); if (unlikely(err < 0)) { - // failed to instert a hash PTE due to an hypervisor error + // failed to insert a hash PTE due to an hypervisor error if (user_mode(regs)) { if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2) _exception(SIGSEGV, regs, SEGV_ACCERR, ea); @@ -1582,36 +2012,6 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault) } } -/* - * The _RAW interrupt entry checks for the in_nmi() case before - * running the full handler. - */ -DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) -{ - /* - * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then - * don't call hash_page, just fail the fault. This is required to - * prevent re-entrancy problems in the hash code, namely perf - * interrupts hitting while something holds H_PAGE_BUSY, and taking a - * hash fault. See the comment in hash_preload(). - * - * We come here as a result of a DSI at a point where we don't want - * to call hash_page, such as when we are accessing memory (possibly - * user memory) inside a PMU interrupt that occurred while interrupts - * were soft-disabled. We want to invoke the exception handler for - * the access, or panic if there isn't a handler. - */ - if (unlikely(in_nmi())) { - do_bad_page_fault_segv(regs); - return 0; - } - - __do_hash_fault(regs); - - return 0; -} - -#ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { int psize = get_slice_psize(mm, ea); @@ -1628,12 +2028,6 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) return true; } -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) @@ -1677,26 +2071,18 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* - * __hash_page_* must run with interrupts off, as it sets the - * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any - * time and may take a hash fault reading the user stack, see - * read_user_stack_slow() in the powerpc/perf code. + * __hash_page_* must run with interrupts off, including PMI interrupts + * off, as it sets the H_PAGE_BUSY bit. * - * If that takes a hash fault on the same page as we lock here, it - * will bail out when seeing H_PAGE_BUSY set, and retry the access - * leading to an infinite loop. - * - * Disabling interrupts here does not prevent perf interrupts, but it - * will prevent them taking hash faults (see the NMI test in - * do_hash_page), then read_user_stack's copy_from_user_nofault will - * fail and perf will fall back to read_user_stack_slow(), which - * walks the Linux page tables. + * It's otherwise possible for perf interrupts to hit at any time and + * may take a hash fault reading the user stack, which could take a + * hash miss and deadlock on the same H_PAGE_BUSY bit. * * Interrupts must also be off for the duration of the * mm_is_thread_local test and update, to prevent preempt running the * mm on another CPU (XXX: this may be racy vs kthread_use_mm). */ - local_irq_save(flags); + powerpc_local_irq_pmu_save(flags); /* Is that local to this CPU ? */ if (mm_is_thread_local(mm)) @@ -1721,7 +2107,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, mm_ctx_user_psize(&mm->context), pte_val(*ptep)); - local_irq_restore(flags); + powerpc_local_irq_pmu_restore(flags); } /* @@ -1732,7 +2118,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* @@ -1742,9 +2128,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) - return; - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; @@ -1941,72 +2324,68 @@ repeat: return slot; } -#ifdef CONFIG_DEBUG_PAGEALLOC -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +void hpt_clear_stress(void) { - unsigned long hash; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); - long ret; - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - - /* Don't create HPTE entries for bad address */ - if (!vsid) - return; - - ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, - HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); - - BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + int cpu = raw_smp_processor_id(); + int g; + + for (g = 0; g < stress_nr_groups(); g++) { + unsigned long last_group; + last_group = stress_hpt_struct[cpu].last_group[g]; + + if (last_group != -1UL) { + int i; + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[g] = -1; + } + } } -static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) +void hpt_do_stress(unsigned long ea, unsigned long hpte_group) { - unsigned long hash, hidx, slot; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long last_group; + int cpu = raw_smp_processor_id(); - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); - hidx = linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, - mmu_linear_psize, - mmu_kernel_ssize, 0); -} + last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; + if (hpte_group == last_group) + return; -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long flags, vaddr, lmi; - int i; + if (last_group != -1UL) { + int i; + /* + * Concurrent CPUs might be inserting into this group, so + * give up after a number of iterations, to prevent a live + * lock. + */ + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1; + } - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); + if (ea >= PAGE_OFFSET) { + /* + * We would really like to prefetch to get the TLB loaded, then + * remove the PTE before returning from fault interrupt, to + * increase the hash fault rate. + * + * Unfortunately QEMU TCG does not model the TLB in a way that + * makes this possible, and systemsim (mambo) emulator does not + * bring in TLBs with prefetches (although loads/stores do + * work for non-CI PTEs). + * + * So remember this PTE and clear it on the next hash fault. + */ + memmove(&stress_hpt_struct[cpu].last_group[1], + &stress_hpt_struct[cpu].last_group[0], + (stress_nr_groups() - 1) * sizeof(unsigned long)); + stress_hpt_struct[cpu].last_group[0] = hpte_group; } - local_irq_restore(flags); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) @@ -2072,7 +2451,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, + if (radix_enabled()) + return 0; + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; } @@ -2086,3 +2467,20 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* + * If we are using 1TB segments and we are allowed to randomise + * the heap, we can put it above 1TB so it is backed by a 1TB + * segment. Otherwise the heap will be in the bottom 1TB + * which always uses 256MB segments and this may result in a + * performance penalty. + */ + if (is_32bit_task()) + return randomize_page(mm->brk, SZ_32M); + else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T) + return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G); + else + return randomize_page(mm->brk, SZ_1G); +} diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index a688e1324ae5..2bcbbf9d85ac 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -16,6 +16,7 @@ unsigned int hpage_shift; EXPORT_SYMBOL(hpage_shift); +#ifdef CONFIG_PPC_64S_HASH_MMU int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, unsigned int shift, unsigned int mmu_psize) @@ -52,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* + * If hash-4k, hugepages use seeral contiguous PxD entries + * so bail out and let mm make the page young or dirty + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { + if (!(old_pte & _PAGE_ACCESSED)) + return 1; + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) + return 1; + } /* * Try to lock the PTE, add ACCESSED and DIRTY if it was @@ -63,7 +74,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); /* Make sure this is a hugetlb entry */ - if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + if (old_pte & H_PAGE_THP_HUGE) return 0; rflags = htab_convert_pte_flags(new_pte, flags); @@ -122,6 +133,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } +#endif pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -141,14 +153,17 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { + unsigned long psize; if (radix_enabled()) return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } -void hugetlbpage_init_default(void) +void __init hugetlbpage_init_defaultsize(void) { /* Set default large page size. Currently, we pick 16M or 1M * depending on what is available diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index 5045048ce244..cad08d83369c 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -13,7 +13,23 @@ static inline bool stress_slb(void) return static_branch_unlikely(&stress_slb_key); } -void slb_setup_new_exec(void); +extern bool stress_hpt_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_hpt_key); + +static inline bool stress_hpt(void) +{ + return static_branch_unlikely(&stress_hpt_key); +} + +extern bool no_slb_preload; +DECLARE_STATIC_KEY_FALSE(no_slb_preload_key); +static inline bool slb_preload_disabled(void) +{ + return static_branch_unlikely(&no_slb_preload_key); +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group); void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index cd18e94d0843..c0e8d597e4cb 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { @@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; @@ -305,24 +305,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(mm_iommu_lookup); -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -369,56 +351,6 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, unsigned int pageshift, unsigned long *size) { diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c10fc8a72fb3..fb9dcf9ca599 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -31,7 +31,8 @@ static int alloc_context_id(int min_id, int max_id) return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); } -void hash__reserve_context_id(int id) +#ifdef CONFIG_PPC_64S_HASH_MMU +void __init hash__reserve_context_id(int id) { int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); @@ -50,7 +51,9 @@ int hash__alloc_context_id(void) return alloc_context_id(MIN_USER_CONTEXT, max); } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +#endif +#ifdef CONFIG_PPC_64S_HASH_MMU static int realloc_context_ids(mm_context_t *ctx) { int i, id; @@ -147,9 +150,14 @@ static int hash__init_new_context(struct mm_struct *mm) void hash__setup_new_exec(void) { slice_setup_new_exec(); - - slb_setup_new_exec(); } +#else +static inline int hash__init_new_context(struct mm_struct *mm) +{ + BUILD_BUG(); + return 0; +} +#endif static int radix__init_new_context(struct mm_struct *mm) { @@ -175,7 +183,9 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); +#ifdef CONFIG_PPC_64S_HASH_MMU mm->context.hash_context = NULL; +#endif return index; } @@ -213,28 +223,36 @@ EXPORT_SYMBOL_GPL(__destroy_context); static void destroy_contexts(mm_context_t *ctx) { - int index, context_id; + if (radix_enabled()) { + ida_free(&mmu_context_ida, ctx->id); + } else { +#ifdef CONFIG_PPC_64S_HASH_MMU + int index, context_id; - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +#else + BUILD_BUG(); // radix_enabled() should be constant true +#endif } - kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9ffa65074cb0..e3485db7de02 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,9 +6,11 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/memblock.h> -#include <misc/cxl-base.h> +#include <linux/memremap.h> +#include <linux/pkeys.h> +#include <linux/debugfs.h> +#include <linux/proc_fs.h> -#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> @@ -22,11 +24,31 @@ #include "internal.h" +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif + unsigned long __pmd_frag_nr; EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); +#ifdef CONFIG_KFENCE +extern bool kfence_early_init; +static int __init parse_kfence_early_init(char *arg) +{ + int val; + + if (get_option(&arg, &val)) + kfence_early_init = !!val; + return 0; +} +early_param("kfence.sample_interval", parse_kfence_early_init); +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -40,7 +62,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, { int changed; #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + WARN_ON(!pmd_trans_huge(*pmdp)); assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); #endif changed = !pmd_same(*(pmdp), entry); @@ -55,11 +77,38 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -75,12 +124,29 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd))); + WARN_ON(!(pmd_leaf(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_leaf(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -91,14 +157,14 @@ static void do_serialize(void *arg) } /* - * Serialize against find_current_mm_pte which does lock-less + * Serialize against __find_linux_pte() which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. + * __find_linux_pte() to finish. */ void serialize_against_pte_lookup(struct mm_struct *mm) { @@ -115,18 +181,30 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, { unsigned long old_pmd; + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return __pmd(old_pmd); } +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + unsigned long old_pud; + + VM_WARN_ON_ONCE(!pud_present(*pudp)); + old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return __pud(old_pud); +} + pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { pmd_t pmd; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); - VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)) || + !pmd_present(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); /* * if it not a fullmm flush, then we can possibly end up converting @@ -138,11 +216,34 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON(!pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -157,9 +258,13 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) { - return pfn_pmd(page_to_pfn(page), pgprot); + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); } pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) @@ -170,10 +275,19 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdv &= _HPAGE_CHG_MASK; return pmd_set_protbits(__pmd(pmdv), newprot); } + +pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + unsigned long pudv; + + pudv = pud_val(pud); + pudv &= _HPAGE_CHG_MASK; + return pud_set_protbits(__pud(pudv), newprot); +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); @@ -207,17 +321,8 @@ void __init mmu_partition_table_init(void) unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; unsigned long ptcr; - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - - /* - * update partition table control register, - * 64 K size. - */ + partition_tb = memblock_alloc_or_panic(patb_size, patb_size); ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); @@ -302,22 +407,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -327,12 +432,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return - * the allocated page with single fragement + * If we find ptdesc_page set, we return + * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -353,15 +458,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -377,18 +482,6 @@ static inline void pgtable_free(void *table, int index) case PUD_INDEX: __pud_free(table); break; -#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) - /* 16M hugepd directory at pud level */ - case HTLB_16M_INDEX: - BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); - break; - /* 16G hugepd directory at the pgd level */ - case HTLB_16G_INDEX: - BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); - break; -#endif /* We don't free pgd table via RCU callback */ default: BUG(); @@ -417,20 +510,21 @@ atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; void arch_report_meminfo(struct seq_file *m) { - /* - * Hash maps the memory with one size mmu_linear_psize. - * So don't bother to print these on hash - */ - if (!radix_enabled()) - return; seq_printf(m, "DirectMap4k: %8lu kB\n", atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); - seq_printf(m, "DirectMap64k: %8lu kB\n", + seq_printf(m, "DirectMap64k: %8lu kB\n", atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); - seq_printf(m, "DirectMap2M: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); - seq_printf(m, "DirectMap1G: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); + if (radix_enabled()) { + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); + } else { + seq_printf(m, "DirectMap16M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14); + seq_printf(m, "DirectMap16G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24); + } } #endif /* CONFIG_PROC_FS */ @@ -459,6 +553,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, set_pte_at(vma->vm_mm, addr, ptep, pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd @@ -480,11 +575,12 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } +#endif /* * Does the CPU support tlbie? */ -bool tlbie_capable __read_mostly = true; +bool tlbie_capable __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE); EXPORT_SYMBOL(tlbie_capable); /* @@ -492,7 +588,7 @@ EXPORT_SYMBOL(tlbie_capable); * address spaces? tlbie may still be used for nMMU accelerators, and for KVM * guest address spaces. */ -bool tlbie_enabled __read_mostly = true; +bool tlbie_enabled __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE); static int __init setup_disable_tlbie(char *str) { @@ -520,9 +616,50 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. */ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; } arch_initcall(pgtable_debugfs_setup); + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN) +/* + * Override the generic version in mm/memremap.c. + * + * With hash translation, the direct-map range is mapped with just one + * page size selected by htab_init_page_sizes(). Consult + * mmu_psize_defs[] to determine the minimum page size alignment. +*/ +unsigned long memremap_compat_align(void) +{ + if (!radix_enabled()) { + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; + return max(SUBSECTION_SIZE, 1UL << shift); + } + + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + +pgprot_t vm_get_page_prot(vm_flags_t vm_flags) +{ + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) + vm_flags |= VM_READ; + + prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]); + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index a2d9ad138709..a974baf8f327 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -10,6 +10,7 @@ #include <asm/mmu.h> #include <asm/setup.h> #include <asm/smp.h> +#include <asm/firmware.h> #include <linux/pkeys.h> #include <linux/of_fdt.h> @@ -66,7 +67,7 @@ static int __init dt_scan_storage_keys(unsigned long node, return 1; } -static int scan_pkey_feature(void) +static int __init scan_pkey_feature(void) { int ret; int pkeys_total = 0; @@ -88,7 +89,8 @@ static int scan_pkey_feature(void) unsigned long pvr = mfspr(SPRN_PVR); if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || - PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9) + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 || + PVR_VER(pvr) == PVR_HX_C2000) pkeys_total = 32; } } @@ -290,7 +292,7 @@ void setup_kuap(bool disabled) if (smp_processor_id() == boot_cpuid) { pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUAP; + cur_cpu_spec->mmu_features |= MMU_FTR_KUAP; } /* diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 23d3e08911d3..35fd2a95be24 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,61 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); -} - -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. - */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, @@ -101,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, pte_t old_pte, pte_t pte) { struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e50ddf129c15..73977dbabcf2 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -17,6 +17,7 @@ #include <linux/hugetlb.h> #include <linux/string_helpers.h> #include <linux/memory.h> +#include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/mmu_context.h> @@ -30,12 +31,14 @@ #include <asm/trace.h> #include <asm/uaccess.h> #include <asm/ultravisor.h> +#include <asm/set_memory.h> +#include <asm/kfence.h> #include <trace/events/thp.h> -unsigned int mmu_pid_bits; +#include <mm/mmu_decl.h> + unsigned int mmu_base_pid; -unsigned long radix_mem_block_size __ro_after_init; static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) @@ -203,14 +206,14 @@ static void radix__change_memory_range(unsigned long start, unsigned long end, pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; - if (pud_is_leaf(*pudp)) { + if (pud_leaf(*pudp)) { ptep = (pte_t *)pudp; goto update_the_pte; } pmdp = pmd_alloc(&init_mm, pudp, idx); if (!pmdp) continue; - if (pmd_is_leaf(*pmdp)) { + if (pmd_leaf(*pmdp)) { ptep = pmdp_ptep(pmdp); goto update_the_pte; } @@ -229,9 +232,17 @@ void radix__mark_rodata_ro(void) unsigned long start, end; start = (unsigned long)_stext; - end = (unsigned long)__init_begin; + end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -260,21 +271,44 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); #endif return end; } static int __meminit create_physical_mapping(unsigned long start, unsigned long end, - unsigned long max_mapping_size, - int nid, pgprot_t _prot) + int nid, pgprot_t _prot, + unsigned long mapping_sz_limit) { unsigned long vaddr, addr, mapping_size = 0; bool prev_exec, exec = false; pgprot_t prot; int psize; + unsigned long max_mapping_size = memory_block_size; + + if (mapping_sz_limit < max_mapping_size) + max_mapping_size = mapping_sz_limit; + + if (debug_pagealloc_enabled()) + max_mapping_size = PAGE_SIZE; start = ALIGN(start, PAGE_SIZE); end = ALIGN_DOWN(end, PAGE_SIZE); @@ -328,14 +362,70 @@ static int __meminit create_physical_mapping(unsigned long start, return 0; } +#ifdef CONFIG_KFENCE +static __init phys_addr_t alloc_kfence_pool(void) +{ + phys_addr_t kfence_pool; + + /* + * TODO: Support to enable KFENCE after bootup depends on the ability to + * split page table mappings. As such support is not currently + * implemented for radix pagetables, support enabling KFENCE + * only at system startup for now. + * + * After support for splitting mappings is available on radix, + * alloc_kfence_pool() & map_kfence_pool() can be dropped and + * mapping for __kfence_pool memory can be + * split during arch_kfence_init_pool(). + */ + if (!kfence_early_init) + goto no_kfence; + + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + if (!kfence_pool) + goto no_kfence; + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + return kfence_pool; + +no_kfence: + disable_kfence(); + return 0; +} + +static __init void map_kfence_pool(phys_addr_t kfence_pool) +{ + if (!kfence_pool) + return; + + if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, + -1, PAGE_KERNEL, PAGE_SIZE)) + goto err; + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); + return; + +err: + memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE); + disable_kfence(); +} +#else +static inline phys_addr_t alloc_kfence_pool(void) { return 0; } +static inline void map_kfence_pool(phys_addr_t kfence_pool) { } +#endif + static void __init radix_init_pgtable(void) { + phys_addr_t kfence_pool; unsigned long rts_field; phys_addr_t start, end; u64 i; /* We don't support slb for radix */ - mmu_slb_size = 0; + slb_set_size(0); + + kfence_pool = alloc_kfence_pool(); /* * Create the linear mapping @@ -353,22 +443,18 @@ static void __init radix_init_pgtable(void) } WARN_ON(create_physical_mapping(start, end, - radix_mem_block_size, - -1, PAGE_KERNEL)); + -1, PAGE_KERNEL, ~0UL)); } - /* Find out how many PID bits are supported */ + map_kfence_pool(kfence_pool); + if (!cpu_has_feature(CPU_FTR_HVMODE) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { /* - * Older versions of KVM on these machines perfer if the + * Older versions of KVM on these machines prefer if the * guest only uses the low 19 PID bits. */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - } else { - if (!mmu_pid_bits) - mmu_pid_bits = 20; + mmu_pid_bits = 19; } mmu_base_pid = 1; @@ -449,11 +535,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - /* Grab page size encodings */ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); if (!prop) @@ -484,58 +565,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, return 1; } -#ifdef CONFIG_MEMORY_HOTPLUG -static int __init probe_memory_block_size(unsigned long node, const char *uname, int - depth, void *data) -{ - unsigned long *mem_block_size = (unsigned long *)data; - const __be32 *prop; - int len; - - if (depth != 1) - return 0; - - if (strcmp(uname, "ibm,dynamic-reconfiguration-memory")) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); - - if (!prop || len < dt_root_size_cells * sizeof(__be32)) - /* - * Nothing in the device tree - */ - *mem_block_size = MIN_MEMORY_BLOCK_SIZE; - else - *mem_block_size = of_read_number(prop, dt_root_size_cells); - return 1; -} - -static unsigned long radix_memory_block_size(void) -{ - unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE; - - /* - * OPAL firmware feature is set by now. Hence we are ok - * to test OPAL feature. - */ - if (firmware_has_feature(FW_FEATURE_OPAL)) - mem_block_size = 1UL * 1024 * 1024 * 1024; - else - of_scan_flat_dt(probe_memory_block_size, &mem_block_size); - - return mem_block_size; -} - -#else /* CONFIG_MEMORY_HOTPLUG */ - -static unsigned long radix_memory_block_size(void) -{ - return 1UL * 1024 * 1024 * 1024; -} - -#endif /* CONFIG_MEMORY_HOTPLUG */ - - void __init radix__early_init_devtree(void) { int rc; @@ -559,51 +588,20 @@ void __init radix__early_init_devtree(void) mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize = psize_to_rpti_pgsize(MMU_PAGE_64K); } - - /* - * Max mapping size used when mapping pages. We don't use - * ppc_md.memory_block_size() here because this get called - * early and we don't have machine probe called yet. Also - * the pseries implementation only check for ibm,lmb-size. - * All hypervisor supporting radix do expose that device - * tree node. - */ - radix_mem_block_size = radix_memory_block_size(); return; } -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - void __init radix__early_init_mmu(void) { unsigned long lpcr; +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; #endif /* * initialize page table size @@ -644,7 +642,6 @@ void __init radix__early_init_mmu(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); - radix_init_amor(); } else { radix_init_pseries(); } @@ -668,8 +665,6 @@ void radix__early_init_mmu_secondary(void) set_ptcr_when_no_uv(__pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - - radix_init_amor(); } radix__switch_mmu_context(NULL, &init_mm); @@ -679,7 +674,8 @@ void radix__early_init_mmu_secondary(void) mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; @@ -738,10 +734,60 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) { - unsigned long next; + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. + */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + __free_pages(page, order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; pte_t *pte; pte = pte_start + pte_index(addr); @@ -753,23 +799,28 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; } - - pte_clear(&init_mm, addr, pte); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif } + if (direct) + update_page_count(mmu_virtual_psize, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pte_t *pte_base; pmd_t *pmd; @@ -780,26 +831,36 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, if (!pmd_present(*pmd)) continue; - if (pmd_is_leaf(*pmd)) { - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) { - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (pmd_leaf(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; } - pte_clear(&init_mm, addr, (pte_t *)pmd); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } + if (direct) + update_page_count(MMU_PAGE_2M, -pages); } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { - unsigned long next; + unsigned long next, pages = 0; pmd_t *pmd_base; pud_t *pud; @@ -810,23 +871,28 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, if (!pud_present(*pud)) continue; - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { if (!IS_ALIGNED(addr, PUD_SIZE) || !IS_ALIGNED(next, PUD_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); continue; } pte_clear(&init_mm, addr, (pte_t *)pud); + pages++; continue; } pmd_base = pud_pgtable(*pud); - remove_pmd_table(pmd_base, addr, next); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } + if (direct) + update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; @@ -843,7 +909,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) if (!p4d_present(*p4d)) continue; - if (p4d_is_leaf(*p4d)) { + if (p4d_leaf(*p4d)) { if (!IS_ALIGNED(addr, P4D_SIZE) || !IS_ALIGNED(next, P4D_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); @@ -855,7 +921,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) } pud_base = p4d_pgtable(*p4d); - remove_pud_table(pud_base, addr, next); + remove_pud_table(pud_base, addr, next, direct, altmap); free_pud_table(pud_base, p4d); } @@ -873,12 +939,12 @@ int __meminit radix__create_section_mapping(unsigned long start, } return create_physical_mapping(__pa(start), __pa(end), - radix_mem_block_size, nid, prot); + nid, prot, ~0UL); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end); + remove_pagetable(start, end, true, NULL); return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -896,7 +962,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys) { /* Create a PTE encoding */ - unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; @@ -905,16 +970,456 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return -1; } - ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid); BUG_ON(ret); return 0; } +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} +#endif + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_leaf(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * If altmap is present, Make sure we align the start vmemmap addr + * to PAGE_SIZE so that we calculate the correct start_pfn in + * altmap boundary check to decide whether we should use altmap or + * RAM based backing memory allocation. Also the address need to be + * aligned for set_pte operation. If the start addr is already + * PMD_SIZE aligned and with in the altmap boundary then we will + * try to use a pmd size altmap mapping else we go for page size + * mapping. + * + * If altmap is not present, align the vmemmap addr to PMD_SIZE and + * always allocate a PMD size page for vmemmap backing. + * + */ + + if (altmap) + start = ALIGN_DOWN(start, PAGE_SIZE); + else + start = ALIGN_DOWN(start, PMD_SIZE); + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); + continue; + } else { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size); + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); } #endif #endif @@ -928,12 +1433,29 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add unsigned long old; #ifdef CONFIG_DEBUG_VM - WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + WARN_ON(!radix__pmd_trans_huge(*pmdp)); assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif - old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); + trace_hugepage_update_pmd(addr, old, clr, set); + + return old; +} + +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_trans_huge(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); return old; } @@ -946,22 +1468,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); /* * khugepaged calls this for normal pmd */ pmd = *pmdp; pmd_clear(pmdp); - /* - * pmdp collapse_flush need to ensure that there are no parallel gup - * walk after this call. This is needed so that we can have stable - * page ref count when collapsing a page. We don't allow a collapse page - * if we have gup taken on the page. We can ensure that by sending IPI - * because gup walk happens with IRQ disabled. - */ - serialize_against_pte_lookup(vma->vm_mm); - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; @@ -1023,27 +1535,43 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, unsigned long address, int psize) { struct mm_struct *mm = vma->vm_mm; - unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | - _PAGE_RW | _PAGE_EXEC); + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY | + _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); unsigned long change = pte_val(entry) ^ pte_val(*ptep); /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. + * On POWER9, the NMMU is not able to relax PTE access permissions + * for a translation with a TLB. The PTE must be invalidated, TLB + * flushed before the new PTE is installed. + * + * This only needs to be done for radix, because hash translation does + * flush when updating the linux pte (and we don't support NMMU + * accelerators on HPT on POWER9 anyway XXX: do we?). + * + * POWER10 (and P9P) NMMU does behave as per ISA. */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && + atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ new_pte = old_pte | set; radix__flush_tlb_page_psize(mm, address, psize); __radix_pte_update(ptep, _PAGE_INVALID, new_pte); @@ -1051,9 +1579,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, __radix_pte_update(ptep, 0, set); /* * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architecture. + * restrictions when the address space (modulo the POWER9 nest + * MMU issue above) because the MMU will reload the PTE after + * taking an access fault, as defined by the architecture. See + * "Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates)" in + * Power ISA Version 3.1B. */ } /* See ptesync comment in radix__set_pte_at */ @@ -1066,11 +1597,12 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && (atomic_read(&mm->context.copros) > 0)) radix__flush_tlb_page(vma, addr); @@ -1092,7 +1624,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (pud_huge(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } @@ -1139,7 +1671,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (pmd_huge(*pmd)) { + if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7..9e1f6558d026 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,6 +10,7 @@ #include <linux/memblock.h> #include <linux/mmu_context.h> #include <linux/sched/mm.h> +#include <linux/debugfs.h> #include <asm/ppc-opcode.h> #include <asm/tlb.h> @@ -126,21 +127,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbie_pid_lpid(unsigned long pid, - unsigned long lpid, - unsigned long ric) -{ - unsigned long rb, rs, prs, r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -201,23 +187,6 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, - unsigned long lpid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb, rs, prs, r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap, unsigned long ric) { @@ -263,22 +232,6 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, } } -static inline void fixup_tlbie_va_range_lpid(unsigned long va, - unsigned long pid, - unsigned long lpid, - unsigned long ap) -{ - if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); - } - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); - } -} - static inline void fixup_tlbie_pid(unsigned long pid) { /* @@ -298,26 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid) } } -static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) -{ - /* - * We can use any address for the invalidation, pick one which is - * probably unused as an optimisation. - */ - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); - } - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - asm volatile("ptesync" : : : "memory"); - __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), - RIC_FLUSH_TLB); - } -} - static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap) { @@ -396,7 +329,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) /* * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint + * must be a compile-time constraint to match the "i" constraint * in the asm statement. */ switch (ric) { @@ -415,31 +348,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, - unsigned long ric) -{ - asm volatile("ptesync" : : : "memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); - fixup_tlbie_pid_lpid(pid, lpid); - break; - case RIC_FLUSH_PWC: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); - fixup_tlbie_pid_lpid(pid, lpid); - } - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} struct tlbiel_pid { unsigned long pid; unsigned long ric; @@ -565,20 +473,6 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, fixup_tlbie_va_range(addr - page_size, pid, ap); } -static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, - unsigned long pid, unsigned long lpid, - unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); - - fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); -} - static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -659,18 +553,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, - unsigned long pid, unsigned long lpid, - unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync" : : : "memory"); - if (also_pwc) - __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); - __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} - static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, @@ -699,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); @@ -712,12 +595,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP void radix__local_flush_all_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_all_mm); @@ -731,12 +615,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -754,10 +639,18 @@ EXPORT_SYMBOL(radix__local_flush_tlb_page); static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; if (atomic_read(&mm->context.copros) > 0) return true; return false; @@ -783,12 +676,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) goto out; if (current->active_mm == mm) { + unsigned long flags; + WARN_ON_ONCE(current->mm != NULL); - /* Is a kernel thread and is using mm as the lazy tlb */ - mmgrab(&init_mm); + /* + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. + */ + local_irq_save(flags); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); - mmdrop(mm); + mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } /* @@ -800,7 +701,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) * that's what the caller expects. */ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { - atomic_dec(&mm->context.active_cpus); + dec_mm_active_cpus(mm); cpumask_clear_cpu(cpu, mm_cpumask(mm)); always_flush = true; } @@ -936,7 +837,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -967,6 +868,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -976,7 +878,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1000,6 +902,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1015,7 +918,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1095,6 +998,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +/* + * Doesn't appear to be used anywhere. Remove. + */ #define TLB_FLUSH_ALL -1UL /* @@ -1106,8 +1012,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -1116,23 +1022,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid, flush_pwc = false; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1170,15 +1075,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } } } else { - bool hflush = false; + bool hflush; unsigned long hstart, hend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart < hend) - hflush = true; - } + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend; if (type == FLUSH_TYPE_LOCAL) { asm volatile("ptesync": : :"memory"); @@ -1209,6 +1111,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1293,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm || tlb->need_flush_all) { - __flush_all_mm(mm, true); + if (tlb->fullmm) { + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * Shootdown based lazy tlb mm refcounting means we + * have to IPI everyone in the mm_cpumask anyway soon + * when the mm goes away, so might as well do it as + * part of the final flush now. + * + * If lazy shootdown was improved to reduce IPIs (e.g., + * by batching), then it may end up being better to use + * tlbies here instead. + */ + preempt_disable(); + + smp_mb(); /* see radix__flush_tlb_mm */ + exit_flush_lazy_tlbs(mm); + __flush_all_mm(mm, true); + + preempt_enable(); + } else { + __flush_all_mm(mm, true); + } + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) radix__flush_tlb_mm(mm); @@ -1316,25 +1240,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; - fullmm = (end == TLB_FLUSH_ALL); + WARN_ON_ONCE(end == TLB_FLUSH_ALL); preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1376,6 +1297,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, @@ -1397,7 +1319,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; /* 4k page size, just blow the world */ @@ -1445,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; @@ -1470,6 +1399,127 @@ void radix__flush_tlb_all(void) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static __always_inline void __tlbie_pid_lpid(unsigned long pid, + unsigned long lpid, + unsigned long ric) +{ + unsigned long rb, rs, prs, r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, + unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb, rs, prs, r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), + RIC_FLUSH_TLB); + } +} + +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, + unsigned long ric) +{ + asm volatile("ptesync" : : : "memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + fixup_tlbie_pid_lpid(pid, lpid); + break; + case RIC_FLUSH_PWC: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + fixup_tlbie_pid_lpid(pid, lpid); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +static inline void fixup_tlbie_va_range_lpid(unsigned long va, + unsigned long pid, + unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); + } +} + +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); +} + +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync" : : : "memory"); + if (also_pwc) + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + /* * Performs process-scoped invalidations for a given LPID * as part of H_RPT_INVALIDATE hcall. @@ -1524,3 +1574,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c91bd85eb90e..15f73abd1506 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -9,11 +9,11 @@ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM */ -#include <asm/asm-prototypes.h> #include <asm/interrupt.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> +#include <asm/lppaca.h> #include <asm/ppc-opcode.h> #include <asm/cputable.h> #include <asm/cacheflush.h> @@ -24,7 +24,7 @@ #include <linux/pgtable.h> #include <asm/udbg.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include "internal.h" @@ -42,6 +42,15 @@ early_param("stress_slb", parse_stress_slb); __ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key); +bool no_slb_preload __initdata; +static int __init parse_no_slb_preload(char *p) +{ + no_slb_preload = true; + return 0; +} +early_param("no_slb_preload", parse_no_slb_preload); +__ro_after_init DEFINE_STATIC_KEY_FALSE(no_slb_preload_key); + static void assert_slb_presence(bool present, unsigned long ea) { #ifdef CONFIG_DEBUG_VM @@ -294,11 +303,14 @@ static bool preload_hit(struct thread_info *ti, unsigned long esid) return false; } -static bool preload_add(struct thread_info *ti, unsigned long ea) +static void preload_add(struct thread_info *ti, unsigned long ea) { unsigned char idx; unsigned long esid; + if (slb_preload_disabled()) + return; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { /* EAs are stored >> 28 so 256MB segments don't need clearing */ if (ea & ESID_MASK_1T) @@ -308,7 +320,7 @@ static bool preload_add(struct thread_info *ti, unsigned long ea) esid = ea >> SID_SHIFT; if (preload_hit(ti, esid)) - return false; + return; idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; ti->slb_preload_esid[idx] = esid; @@ -316,8 +328,6 @@ static bool preload_add(struct thread_info *ti, unsigned long ea) ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; else ti->slb_preload_nr++; - - return true; } static void preload_age(struct thread_info *ti) @@ -328,94 +338,6 @@ static void preload_age(struct thread_info *ti) ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; } -void slb_setup_new_exec(void) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long exec = 0x10000000; - - WARN_ON(irqs_disabled()); - - /* - * preload cache can only be used to determine whether a SLB - * entry exists if it does not start to overflow. - */ - if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* - * We have no good place to clear the slb preload cache on exec, - * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. - * - * For the most part that's probably okay to use entries from the - * previous exec, they will age out if unused. It may turn out to - * be an advantage to clear the cache before switching to it, - * however. - */ - - /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - if (!is_kernel_addr(exec)) { - if (preload_add(ti, exec)) - slb_allocate_user(mm, exec); - } - - /* Libraries and mmaps. */ - if (!is_kernel_addr(mm->mmap_base)) { - if (preload_add(ti, mm->mmap_base)) - slb_allocate_user(mm, mm->mmap_base); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - -void preload_new_slb_context(unsigned long start, unsigned long sp) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long heap = mm->start_brk; - - WARN_ON(irqs_disabled()); - - /* see above */ - if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* Userspace entry address. */ - if (!is_kernel_addr(start)) { - if (preload_add(ti, start)) - slb_allocate_user(mm, start); - } - - /* Top of stack, grows down. */ - if (!is_kernel_addr(sp)) { - if (preload_add(ti, sp)) - slb_allocate_user(mm, sp); - } - - /* Bottom of heap, grows up. */ - if (heap && !is_kernel_addr(heap)) { - if (preload_add(ti, heap)) - slb_allocate_user(mm, heap); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - static void slb_cache_slbie_kernel(unsigned int index) { unsigned long slbie_data = get_paca()->slb_cache[index]; @@ -502,6 +424,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) copy_mm_to_paca(mm); + if (slb_preload_disabled()) + return; + /* * We gradually age out SLBs after a number of context switches to * reduce reload overhead of unused entries (like we do with FP/VEC @@ -616,7 +541,7 @@ static void slb_cache_update(unsigned long esid_data) } else { /* * Our cache is full and the current cache content strictly - * doesn't indicate the active SLB conents. Bump the ptr + * doesn't indicate the active SLB contents. Bump the ptr * so that switch_slb() will ignore the cache. */ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; @@ -822,7 +747,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* @@ -868,19 +793,3 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) return err; } } - -DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault) -{ - int err = regs->result; - - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); - else - bad_page_fault(regs, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/book3s64/slice.c index 82b45b1cb973..28bec5bc7879 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -22,7 +22,7 @@ #include <linux/security.h> #include <asm/mman.h> #include <asm/mmu.h> -#include <asm/copro.h> +#include <asm/spu.h> #include <asm/hugetlb.h> #include <asm/mmu_context.h> @@ -248,7 +248,9 @@ static void slice_convert(struct mm_struct *mm, spin_unlock_irqrestore(&slice_convert_lock, flags); - copro_flush_all_slbs(mm); +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif } /* @@ -276,20 +278,16 @@ static bool slice_scan_available(unsigned long addr, } static unsigned long slice_find_area_bottomup(struct mm_struct *mm, - unsigned long len, + unsigned long addr, unsigned long len, const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, next_end; - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = TASK_UNMAPPED_BASE; + unsigned long found, next_end; + struct vm_unmapped_area_info info = { + .length = len, + .align_mask = PAGE_MASK & ((1ul << pshift) - 1), + }; /* * Check till the allow max value for this mmap request */ @@ -322,21 +320,19 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, } static unsigned long slice_find_area_topdown(struct mm_struct *mm, - unsigned long len, + unsigned long addr, unsigned long len, const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, prev; - struct vm_unmapped_area_info info; + unsigned long found, prev; + struct vm_unmapped_area_info info = { + .flags = VM_UNMAPPED_AREA_TOPDOWN, + .length = len, + .align_mask = PAGE_MASK & ((1ul << pshift) - 1), + }; unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = mm->mmap_base; /* * If we are trying to allocate above DEFAULT_MAP_WINDOW * Add the different to the mmap_base. @@ -377,7 +373,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. */ - return slice_find_area_bottomup(mm, len, available, psize, high_limit); + return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit); } @@ -386,9 +382,9 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, int topdown, unsigned long high_limit) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize, high_limit); + return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit); else - return slice_find_area_bottomup(mm, len, mask, psize, high_limit); + return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit); } static inline void slice_copy_mask(struct slice_mask *dst, @@ -639,24 +635,58 @@ return_addr: } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); +#ifdef CONFIG_HUGETLB_PAGE +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + + return shift_to_mmu_psize(huge_page_shift(hstate)); +} +#else +static int file_to_psize(struct file *file) +{ + return 0; +} +#endif + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, + vm_flags_t vm_flags) { - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(¤t->mm->context), 0); + unsigned int psize; + + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); + + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr, len, flags, psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, + vm_flags_t vm_flags) { - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(¤t->mm->context), 1); + unsigned int psize; + + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); + + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr0, len, flags, psize, 1); } unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -712,7 +742,6 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } -#ifdef CONFIG_PPC_BOOK3S_64 void slice_setup_new_exec(void) { struct mm_struct *mm = current->mm; @@ -724,7 +753,6 @@ void slice_setup_new_exec(void) mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } -#endif void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) @@ -779,4 +807,13 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, return !slice_check_range_fits(mm, maskp, addr, len); } + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + /* With radix we don't use slice, so derive it from vma*/ + if (radix_enabled()) + return vma_kernel_pagesize(vma); + + return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); +} #endif diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972..ec98e526167e 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -71,6 +71,8 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, if (pmd_none(*pmd)) return; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return; arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); @@ -143,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops subpage_walk_ops = { .pmd_entry = subpage_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma_range(vmi, vma, addr + len) { + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c new file mode 100644 index 000000000000..ccd64b5e6cac --- /dev/null +++ b/arch/powerpc/mm/book3s64/trace.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file is for defining trace points and trace related helpers. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include <trace/events/thp.h> +#endif diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 63363787e000..7186516eca52 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -12,7 +12,7 @@ static inline bool flush_coherent_icache(void) /* * For a snooping icache, we still need a dummy icbi to purge all the * prefetched instructions from the ifetch buffers. We also need a sync - * before the icbi to order the the actual stores to memory that might + * before the icbi to order the actual stores to memory that might * have modified instructions with the icbi. */ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { @@ -78,7 +78,7 @@ EXPORT_SYMBOL(flush_icache_range); #ifdef CONFIG_HIGHMEM /** - * flush_dcache_icache_phys() - Flush a page by it's physical address + * flush_dcache_icache_phys() - Flush a page by its physical address * @physaddr: the physical address of the page */ static void flush_dcache_icache_phys(unsigned long physaddr) @@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p) invalidate_icache_range(addr, addr + PAGE_SIZE); } -static void flush_dcache_icache_hugepage(struct page *page) +void flush_dcache_icache_folio(struct folio *folio) { - int i; - int nr = compound_nr(page); + unsigned int i, nr = folio_nr_pages(folio); - if (!PageHighMem(page)) { + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); for (i = 0; i < nr; i++) - __flush_dcache_icache(lowmem_page_address(page + i)); - } else { + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { for (i = 0; i < nr; i++) { - void *start = kmap_local_page(page + i); + void *start = kmap_local_folio(folio, i * PAGE_SIZE); __flush_dcache_icache(start); kunmap_local(start); } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - if (flush_coherent_icache()) - return; - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - - if (!PageHighMem(page)) { - __flush_dcache_icache(lowmem_page_address(page)); - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_local_page(page); - - __flush_dcache_icache(start); - kunmap_local(start); } else { - flush_dcache_icache_phys(page_to_phys(page)); + unsigned long pfn = folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); } } -EXPORT_SYMBOL(flush_dcache_icache_page); +EXPORT_SYMBOL(flush_dcache_icache_folio); void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 8acd00178956..f5f8692e2c69 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -12,8 +12,6 @@ #include <linux/export.h> #include <asm/reg.h> #include <asm/copro.h> -#include <asm/spu.h> -#include <misc/cxl-base.h> /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -33,19 +31,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (mm->pgd == NULL) return -EFAULT; - mmap_read_lock(mm); - ret = -EFAULT; - vma = find_vma(mm, ea); + vma = lock_mm_and_find_vma(mm, ea, NULL); if (!vma) - goto out_unlock; - - if (ea < vma->vm_start) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_unlock; - if (expand_stack(vma, ea)) - goto out_unlock; - } + return -EFAULT; + ret = -EFAULT; is_write = dsisr & DSISR_ISSTORE; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) @@ -65,6 +55,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, ret = 0; *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); + + /* The fault is fully completed (including releasing mmap lock) */ + if (*flt & VM_FAULT_COMPLETED) + return 0; + if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; @@ -82,6 +77,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(copro_handle_mm_fault); +#ifdef CONFIG_PPC_64S_HASH_MMU int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) { u64 vsid, vsidkey; @@ -137,12 +133,4 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) return 0; } EXPORT_SYMBOL_GPL(copro_calculate_slb); - -void copro_flush_all_slbs(struct mm_struct *mm) -{ -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); #endif - cxl_slbia(mm); -} -EXPORT_SYMBOL_GPL(copro_flush_all_slbs); diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 9af3832c9d8d..8dd7b340d51f 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -11,13 +11,14 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/memblock.h> -#include <asm/prom.h> +#include <linux/slab.h> #include <asm/drmem.h> static int n_root_addr_cells, n_root_size_cells; static struct drmem_lmb_info __drmem_info; struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; u64 drmem_lmb_memory_max(void) { @@ -66,7 +67,7 @@ static int drmem_update_dt_v1(struct device_node *memory, struct property *new_prop; struct of_drconf_cell_v1 *dr_cell; struct drmem_lmb *lmb; - u32 *p; + __be32 *p; new_prop = clone_property(prop, prop->length); if (!new_prop) @@ -178,6 +179,11 @@ int drmem_update_dt(void) if (!memory) return -1; + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. + */ + in_drmem_update = true; prop = of_find_property(memory, "ibm,dynamic-memory", NULL); if (prop) { rc = drmem_update_dt_v1(memory, prop); @@ -186,6 +192,7 @@ int drmem_update_dt(void) if (prop) rc = drmem_update_dt_v2(memory, prop); } + in_drmem_update = false; of_node_put(memory); return rc; @@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data, return ret; } +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. + */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} #endif static int init_drmem_lmb_size(struct device_node *dn) @@ -347,17 +393,17 @@ static const __be32 *of_get_usable_memory(struct device_node *dn) int walk_drmem_lmbs(struct device_node *dn, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)) { + struct device_node *root = of_find_node_by_path("/"); const __be32 *prop, *usm; int ret = -ENODEV; - if (!of_root) + if (!root) return ret; /* Get the address & size cells */ - of_node_get(of_root); - n_root_addr_cells = of_n_addr_cells(of_root); - n_root_size_cells = of_n_size_cells(of_root); - of_node_put(of_root); + n_root_addr_cells = of_n_addr_cells(root); + n_root_size_cells = of_n_size_cells(root); + of_node_put(root); if (init_drmem_lmb_size(dn)) return ret; @@ -445,10 +491,8 @@ static int __init drmem_init(void) const __be32 *prop; dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (!dn) { - pr_info("No dynamic reconfiguration memory found\n"); + if (!dn) return 0; - } if (init_drmem_lmb_size(dn)) { of_node_put(dn); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a8d0ce85d39a..806c74e0d5ab 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include <linux/pagemap.h> #include <linux/ptrace.h> @@ -71,28 +72,26 @@ static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long add return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); } -static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, + struct mm_struct *mm, struct vm_area_struct *vma) { - struct mm_struct *mm = current->mm; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ - mmap_read_unlock(mm); + if (mm) + mmap_read_unlock(mm); + else + vma_end_read(vma); return __bad_area_nosemaphore(regs, address, si_code); } -static noinline int bad_area(struct pt_regs *regs, unsigned long address) -{ - return __bad_area(regs, address, SEGV_MAPERR); -} - static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct mm_struct *mm, struct vm_area_struct *vma) { - struct mm_struct *mm = current->mm; int pkey; /* @@ -114,7 +113,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, */ pkey = vma_pkey(vma); - mmap_read_unlock(mm); + if (mm) + mmap_read_unlock(mm); + else + vma_end_read(vma); /* * If we are in kernel mode, bail out with a SEGV, this will @@ -129,9 +131,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, return 0; } -static noinline int bad_access(struct pt_regs *regs, unsigned long address) +static noinline int bad_access(struct pt_regs *regs, unsigned long address, + struct mm_struct *mm, struct vm_area_struct *vma) { - return __bad_area(regs, address, SEGV_ACCERR); + return __bad_area(regs, address, SEGV_ACCERR, mm, vma); } static int do_sigbus(struct pt_regs *regs, unsigned long address, @@ -216,7 +219,7 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, // Read/write fault blocked by KUAP is bad, it can never succeed. if (bad_kuap_fault(regs, address, is_write)) { pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n", - is_write ? "write" : "read", address, + str_write_read(is_write), address, from_kuid(&init_user_ns, current_uid())); // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad @@ -270,8 +273,18 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma return false; } + /* + * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as + * defined in protection_map[]. In that case Read faults can only be + * caused by a PROT_NONE mapping. However a non exec access on a + * VM_EXEC only mapping is invalid anyway, so report it as such. + */ if (unlikely(!vma_is_accessible(vma))) return true; + + if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC) + return true; + /* * We should ideally do the vma pkey access check here. But in the * fault path, handle_mm_fault() also does the same check. To avoid @@ -356,18 +369,33 @@ static void sanity_check_fault(bool is_write, bool is_user, * Define the correct "is_write" bit in error_code based * on the processor family */ -#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#ifdef CONFIG_BOOKE #define page_fault_is_write(__err) ((__err) & ESR_DST) #else #define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) #endif -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) +#ifdef CONFIG_BOOKE #define page_fault_is_bad(__err) (0) #elif defined(CONFIG_PPC_8xx) #define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) #elif defined(CONFIG_PPC64) -#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +static int page_fault_is_bad(unsigned long err) +{ + unsigned long flag = DSISR_BAD_FAULT_64S; + + /* + * PAPR+ v2.11 § 14.15.3.4.1 (unreleased) + * If byte 0, bit 3 of pi-attribute-specifier-type in + * ibm,pi-features property is defined, ignore the DSI error + * which is caused by the paste instruction on the + * suspended NX window. + */ + if (mmu_has_feature(MMU_FTR_NX_DSI)) + flag &= ~DSISR_BAD_COPYPASTE; + + return err & flag; +} #else #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) #endif @@ -412,10 +440,16 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it * take a page fault to a kernel address or a page fault to a user - * address outside of dedicated places + * address outside of dedicated places. + * + * Rather than kfence directly reporting false negatives, search whether + * the NIP belongs to the fixup table for cases where fault could come + * from functions like copy_from_kernel_nofault(). */ if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) { - if (kfence_handle_page_fault(address, is_write, regs)) + if (is_kfence_address((void *)address) && + !search_exception_tables(instruction_pointer(regs)) && + kfence_handle_page_fault(address, is_write, regs)) return 0; return SIGSEGV; @@ -450,6 +484,41 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return bad_access_pkey(regs, address, NULL, vma); + } + + if (unlikely(access_error(is_write, is_exec, vma))) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return bad_access(regs, address, NULL, vma); + } + + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + + if (fault_signal_pending(fault, regs)) + return user_mode(regs) ? 0 : SIGBUS; + +lock_mmap: + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -457,47 +526,19 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * exceptions table. lock_mm_and_find_vma() handles that logic. */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!is_user && !search_exception_tables(regs->nip)) - return bad_area_nosemaphore(regs, address); - retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) - return bad_area(regs, address); - - if (unlikely(vma->vm_start > address)) { - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - return bad_area(regs, address); - - if (unlikely(expand_stack(vma, address))) - return bad_area(regs, address); - } + return bad_area_nosemaphore(regs, address); if (unlikely(access_pkey_error(is_write, is_exec, (error_code & DSISR_KEYFAULT), vma))) - return bad_access_pkey(regs, address, vma); + return bad_access_pkey(regs, address, mm, vma); if (unlikely(access_error(is_write, is_exec, vma))) - return bad_access(regs, address); + return bad_access(regs, address, mm, vma); /* * If for any reason at all we couldn't handle the fault, @@ -511,22 +552,26 @@ retry: if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto out; + /* * Handle the retry right now, the mmap_lock has been released in that * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(current->mm); +done: if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); +out: /* * Major/minor page fault accounting. */ @@ -568,17 +613,23 @@ NOKPROBE_SYMBOL(hash__do_page_fault); static void __bad_page_fault(struct pt_regs *regs, int sig) { int is_write = page_fault_is_write(regs->dsisr); + const char *msg; /* kernel has accessed a bad area */ + if (regs->dar < PAGE_SIZE) + msg = "Kernel NULL pointer dereference"; + else + msg = "Unable to handle kernel data access"; + switch (TRAP(regs)) { case INTERRUPT_DATA_STORAGE: - case INTERRUPT_DATA_SEGMENT: case INTERRUPT_H_DATA_STORAGE: - pr_alert("BUG: %s on %s at 0x%08lx\n", - regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : - "Unable to handle kernel data access", - is_write ? "write" : "read", regs->dar); + pr_alert("BUG: %s on %s at 0x%08lx\n", msg, + str_write_read(is_write), regs->dar); + break; + case INTERRUPT_DATA_SEGMENT: + pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar); break; case INTERRUPT_INST_STORAGE: case INTERRUPT_INST_SEGMENT: @@ -620,4 +671,27 @@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv) { bad_page_fault(regs, SIGSEGV); } + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9a75ba078e1b..d3c1b749dcfc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,11 +24,10 @@ #include <asm/setup.h> #include <asm/hugetlb.h> #include <asm/pte-walk.h> +#include <asm/firmware.h> bool hugetlb_disabled = false; -#define hugepd_none(hpd) (hpd_val(hpd) == 0) - #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ __builtin_ffs(sizeof(void *))) @@ -41,156 +40,43 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s return __find_linux_pte(mm->pgd, addr, NULL, NULL); } -static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned int pdshift, - unsigned int pshift, spinlock_t *ptl) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { - struct kmem_cache *cachep; - pte_t *new; - int i; - int num_hugepd; - - if (pshift >= pdshift) { - cachep = PGT_CACHE(PTE_T_ORDER); - num_hugepd = 1 << (pshift - pdshift); - } else { - cachep = PGT_CACHE(pdshift - pshift); - num_hugepd = 1; - } - - if (!cachep) { - WARN_ONCE(1, "No page table cache created for hugetlb tables"); - return -ENOMEM; - } - - new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; - BUG_ON(pshift > HUGEPD_SHIFT_MASK); - BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); + addr &= ~(sz - 1); - if (!new) - return -ENOMEM; + p4d = p4d_offset(pgd_offset(mm, addr), addr); + if (!mm_pud_folded(mm) && sz >= P4D_SIZE) + return (pte_t *)p4d; - /* - * Make sure other cpus find the hugepd set only after a - * properly initialized page table is visible to them. - * For more details look for comment in __pte_alloc(). - */ - smp_wmb(); + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + if (!mm_pmd_folded(mm) && sz >= PUD_SIZE) + return (pte_t *)pud; - spin_lock(ptl); - /* - * We have multiple higher-level entries that point to the same - * actual pte location. Fill in each as we go and backtrack on error. - * We need all of these so the DTLB pgtable walk code can find the - * right higher-level entry without knowing if it's a hugepage or not. - */ - for (i = 0; i < num_hugepd; i++, hpdp++) { - if (unlikely(!hugepd_none(*hpdp))) - break; - hugepd_populate(hpdp, new, pshift); - } - /* If we bailed from the for loop early, an error occurred, clean up */ - if (i < num_hugepd) { - for (i = i - 1 ; i >= 0; i--, hpdp--) - *hpdp = __hugepd(0); - kmem_cache_free(cachep, new); - } else { - kmemleak_ignore(new); - } - spin_unlock(ptl); - return 0; -} + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; -/* - * At this point we do the placement change only for BOOK3S 64. This would - * possibly work on other subarchs. - */ -pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - p4d_t *p4; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - unsigned pshift = __ffs(sz); - unsigned pdshift = PGDIR_SHIFT; - spinlock_t *ptl; - - addr &= ~(sz-1); - pg = pgd_offset(mm, addr); - p4 = p4d_offset(pg, addr); + if (sz >= PMD_SIZE) { + /* On 8xx, all hugepages are handled as contiguous PTEs */ + if (IS_ENABLED(CONFIG_PPC_8xx)) { + int i; -#ifdef CONFIG_PPC_BOOK3S_64 - if (pshift == PGDIR_SHIFT) - /* 16GB huge page */ - return (pte_t *) p4; - else if (pshift > PUD_SHIFT) { - /* - * We need to use hugepd table - */ - ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)p4; - } else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, p4, addr); - if (!pu) - return NULL; - if (pshift == PUD_SHIFT) - return (pte_t *)pu; - else if (pshift > PMD_SHIFT) { - ptl = pud_lockptr(mm, pu); - hpdp = (hugepd_t *)pu; - } else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - if (!pm) - return NULL; - if (pshift == PMD_SHIFT) - /* 16MB hugepage */ - return (pte_t *)pm; - else { - ptl = pmd_lockptr(mm, pm); - hpdp = (hugepd_t *)pm; + for (i = 0; i < sz / PMD_SIZE; i++) { + if (!pte_alloc_huge(mm, pmd + i, addr)) + return NULL; } } + return (pte_t *)pmd; } -#else - if (pshift >= PGDIR_SHIFT) { - ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)p4; - } else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, p4, addr); - if (!pu) - return NULL; - if (pshift >= PUD_SHIFT) { - ptl = pud_lockptr(mm, pu); - hpdp = (hugepd_t *)pu; - } else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - if (!pm) - return NULL; - ptl = pmd_lockptr(mm, pm); - hpdp = (hugepd_t *)pm; - } - } -#endif - if (!hpdp) - return NULL; - - if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) - return pte_alloc_map(mm, (pmd_t *)hpdp, addr); - - BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); - - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, - pdshift, pshift, ptl)) - return NULL; - return hugepte_offset(*hpdp, addr, pdshift); + return pte_alloc_huge(mm, pmd, addr); } #ifdef CONFIG_PPC_BOOK3S_64 @@ -225,344 +111,27 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; + m->flags = 0; return 1; } -#endif - - -int __init alloc_bootmem_huge_page(struct hstate *h) -{ - -#ifdef CONFIG_PPC_BOOK3S_64 - if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) - return pseries_alloc_bootmem_huge_page(h); -#endif - return __alloc_bootmem_huge_page(h); -} - -#ifndef CONFIG_PPC_BOOK3S_64 -#define HUGEPD_FREELIST_SIZE \ - ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) - -struct hugepd_freelist { - struct rcu_head rcu; - unsigned int index; - void *ptes[]; -}; -static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); - -static void hugepd_free_rcu_callback(struct rcu_head *head) +bool __init hugetlb_node_alloc_supported(void) { - struct hugepd_freelist *batch = - container_of(head, struct hugepd_freelist, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); - - free_page((unsigned long)batch); + return false; } - -static void hugepd_free(struct mmu_gather *tlb, void *hugepte) -{ - struct hugepd_freelist **batchp; - - batchp = &get_cpu_var(hugepd_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - mm_is_thread_local(tlb->mm)) { - kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); - put_cpu_var(hugepd_freelist_cur); - return; - } - - if (*batchp == NULL) { - *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); - (*batchp)->index = 0; - } - - (*batchp)->ptes[(*batchp)->index++] = hugepte; - if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { - call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); - *batchp = NULL; - } - put_cpu_var(hugepd_freelist_cur); -} -#else -static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} #endif -/* Return true when the entry to be freed maps more than the area being freed */ -static bool range_is_outside_limits(unsigned long start, unsigned long end, - unsigned long floor, unsigned long ceiling, - unsigned long mask) -{ - if ((start & mask) < floor) - return true; - if (ceiling) { - ceiling &= mask; - if (!ceiling) - return true; - } - return end - 1 > ceiling - 1; -} -static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, - unsigned long start, unsigned long end, - unsigned long floor, unsigned long ceiling) +int __init alloc_bootmem_huge_page(struct hstate *h, int nid) { - pte_t *hugepte = hugepd_page(*hpdp); - int i; - - unsigned long pdmask = ~((1UL << pdshift) - 1); - unsigned int num_hugepd = 1; - unsigned int shift = hugepd_shift(*hpdp); - - /* Note: On fsl the hpdp may be the first of several */ - if (shift > pdshift) - num_hugepd = 1 << (shift - pdshift); - if (range_is_outside_limits(start, end, floor, ceiling, pdmask)) - return; - - for (i = 0; i < num_hugepd; i++, hpdp++) - *hpdp = __hugepd(0); - - if (shift >= pdshift) - hugepd_free(tlb, hugepte); - else - pgtable_free_tlb(tlb, hugepte, - get_hugepd_cache_index(pdshift - shift)); -} - -static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pgtable_t token = pmd_pgtable(*pmd); - - if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) - return; - - pmd_clear(pmd); - pte_free_tlb(tlb, token, addr); - mm_dec_nr_ptes(tlb->mm); -} - -static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pmd_t *pmd; - unsigned long next; - unsigned long start; - - start = addr; - do { - unsigned long more; - - pmd = pmd_offset(pud, addr); - next = pmd_addr_end(addr, end); - if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { - if (pmd_none_or_clear_bad(pmd)) - continue; - - /* - * if it is not hugepd pointer, we should already find - * it cleared. - */ - WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); - - hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); - - continue; - } - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, - addr, next, floor, ceiling); - } while (addr = next, addr != end); - - if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK)) - return; - - pmd = pmd_offset(pud, start & PUD_MASK); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start & PUD_MASK); - mm_dec_nr_pmds(tlb->mm); -} - -static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pud_t *pud; - unsigned long next; - unsigned long start; - - start = addr; - do { - pud = pud_offset(p4d, addr); - next = pud_addr_end(addr, end); - if (!is_hugepd(__hugepd(pud_val(*pud)))) { - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, - ceiling); - } else { - unsigned long more; - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); - - if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK)) - return; - - pud = pud_offset(p4d, start & PGDIR_MASK); - p4d_clear(p4d); - pud_free_tlb(tlb, pud, start & PGDIR_MASK); - mm_dec_nr_puds(tlb->mm); -} - -/* - * This function frees user-level page tables of a process. - */ -void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned long next; - - /* - * Because there are a number of different possible pagetable - * layouts for hugepage ranges, we limit knowledge of how - * things should be laid out to the allocation path - * (huge_pte_alloc(), above). Everything else works out the - * structure as it goes from information in the hugepd - * pointers. That means that we can't here use the - * optimization used in the normal page free_pgd_range(), of - * checking whether we're actually covering a large enough - * range to have to do anything at the top level of the walk - * instead of at the bottom. - * - * To make sense of this, you should probably go read the big - * block comment at the top of the normal free_pgd_range(), - * too. - */ - - do { - next = pgd_addr_end(addr, end); - pgd = pgd_offset(tlb->mm, addr); - p4d = p4d_offset(pgd, addr); - if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { - if (p4d_none_or_clear_bad(p4d)) - continue; - hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); - } else { - unsigned long more; - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at the pgd level - * for a single hugepage, but all of them point to the - * same kmem cache that holds the hugepte. - */ - more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); -} - -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift) -{ - pte_t *ptep; - spinlock_t *ptl; - struct page *page = NULL; - unsigned long mask; - int shift = hugepd_shift(hpd); - struct mm_struct *mm = vma->vm_mm; - -retry: - /* - * hugepage directory entries are protected by mm->page_table_lock - * Use this instead of huge_pte_lockptr - */ - ptl = &mm->page_table_lock; - spin_lock(ptl); - - ptep = hugepte_offset(hpd, address, pdshift); - if (pte_present(*ptep)) { - mask = (1UL << shift) - 1; - page = pte_page(*ptep); - page += ((address & mask) >> PAGE_SHIFT); - if (flags & FOLL_GET) - get_page(page); - } else { - if (is_hugetlb_entry_migration(*ptep)) { - spin_unlock(ptl); - __migration_entry_wait(mm, ptep, ptl); - goto retry; - } - } - spin_unlock(ptl); - return page; -} - -#ifdef CONFIG_PPC_MM_SLICES -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct hstate *hstate = hstate_file(file); - int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - -#ifdef CONFIG_PPC_RADIX_MMU - if (radix_enabled()) - return radix__hugetlb_get_unmapped_area(file, addr, len, - pgoff, flags); -#endif - return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); -} +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); #endif - -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - /* With radix we don't use slice, so derive it from vma*/ - if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { - unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); - } - return vma_kernel_pagesize(vma); + return __alloc_bootmem_huge_page(h, nid); } bool __init arch_hugetlb_valid_size(unsigned long size) @@ -611,52 +180,19 @@ static int __init hugetlbpage_init(void) for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; - unsigned pdshift; if (!mmu_psize_defs[psize].shift) continue; shift = mmu_psize_to_shift(psize); -#ifdef CONFIG_PPC_BOOK3S_64 - if (shift > PGDIR_SHIFT) - continue; - else if (shift > PUD_SHIFT) - pdshift = PGDIR_SHIFT; - else if (shift > PMD_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PMD_SHIFT; -#else - if (shift < PUD_SHIFT) - pdshift = PMD_SHIFT; - else if (shift < PGDIR_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PGDIR_SHIFT; -#endif - if (add_huge_page_size(1ULL << shift) < 0) continue; - /* - * if we have pdshift and shift value same, we don't - * use pgt cache for hugepd. - */ - if (pdshift > shift) { - if (!IS_ENABLED(CONFIG_PPC_8xx)) - pgtable_cache_add(pdshift - shift); - } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || - IS_ENABLED(CONFIG_PPC_8xx)) { - pgtable_cache_add(PTE_T_ORDER); - } configured = true; } - if (configured) { - if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) - hugetlbpage_init_default(); - } else + if (!configured) pr_info("Failed to initialize. Disabling HugeTLB"); return 0; @@ -676,8 +212,6 @@ void __init gigantic_hugetlb_cma_reserve(void) */ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) { - VM_WARN_ON(order < MAX_ORDER); + if (order) hugetlb_cma_reserve(order); - } } diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 3a82f89827a5..745097554bea 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -20,6 +20,7 @@ #include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/kup.h> +#include <asm/smp.h> phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; EXPORT_SYMBOL_GPL(memstart_addr); @@ -30,9 +31,16 @@ EXPORT_SYMBOL_GPL(kernstart_virt_addr); bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); +#ifdef CONFIG_KFENCE +bool __ro_after_init kfence_disabled; +bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; +#endif static int __init parse_nosmep(char *p) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return 0; + disable_kuep = true; pr_warn("Disabling Kernel Userspace Execution Prevention\n"); return 0; @@ -47,9 +55,26 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); +void __weak setup_kuep(bool disabled) +{ + if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled) + return; + + if (smp_processor_id() != boot_cpuid) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); +} + +void setup_kup(void) +{ + setup_kuap(disable_kuap); + setup_kuep(disable_kuep); +} + #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ - memset(addr, 0, sizeof(void *) << (shift)); \ + memset(addr, 0, sizeof(pgd_t) << (shift)); \ } CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7); @@ -93,19 +118,15 @@ EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ void pgtable_cache_add(unsigned int shift) { char *name; - unsigned long table_size = sizeof(void *) << shift; + unsigned long table_size = sizeof(pgd_t) << shift; unsigned long align = table_size; /* When batching pgtable pointers for RCU freeing, we store * the index size in the low bits. Table alignment must be * big enough to fit it. - * - * Likewise, hugeapge pagetable pointers contain a (different) - * shift value in the low bits. All tables must be aligned so - * as to leave enough 0 bits in the address to contain it. */ - unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, - HUGEPD_SHIFT_MASK + 1); - struct kmem_cache *new; + */ + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; + struct kmem_cache *new = NULL; /* It would be nice if this was a BUILD_BUG_ON(), but at the * moment, gcc doesn't seem to recognize is_power_of_2 as a @@ -118,7 +139,8 @@ void pgtable_cache_add(unsigned int shift) align = max_t(unsigned long, align, minalign); name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); - new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); + if (name) + new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); if (!new) panic("Could not allocate pgtable cache for order %d", shift); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3d690be48e84..4e71dfe7d026 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -29,7 +29,6 @@ #include <linux/slab.h> #include <linux/hugetlb.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/mmu.h> #include <asm/smp.h> @@ -40,6 +39,7 @@ #include <asm/hugetlb.h> #include <asm/kup.h> #include <asm/kasan.h> +#include <asm/fixmap.h> #include <mm/mmu_decl.h> @@ -70,44 +70,10 @@ EXPORT_SYMBOL(agp_special_page); void MMU_init(void); -/* - * this tells the system to map all of ram with the segregs - * (i.e. page tables) instead of the bats. - * -- Cort - */ -int __map_without_bats; -int __map_without_ltlbs; - /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; /* - * Check for command-line options that affect what MMU_init will do. - */ -static void __init MMU_setup(void) -{ - /* Check for nobats option (used in mapin_ram). */ - if (strstr(boot_command_line, "nobats")) { - __map_without_bats = 1; - } - - if (strstr(boot_command_line, "noltlbs")) { - __map_without_ltlbs = 1; - } - if (IS_ENABLED(CONFIG_PPC_8xx)) - return; - - if (IS_ENABLED(CONFIG_KFENCE)) - __map_without_ltlbs = 1; - - if (debug_pagealloc_enabled()) - __map_without_ltlbs = 1; - - if (strict_kernel_rwx_enabled()) - __map_without_ltlbs = 1; -} - -/* * MMU_init sets up the basic memory mappings for the kernel, * including both RAM and possibly some I/O regions, * and sets up the page tables and the MMU hardware ready to go. @@ -117,31 +83,15 @@ void __init MMU_init(void) if (ppc_md.progress) ppc_md.progress("MMU:enter", 0x111); - /* parse args from command line */ - MMU_setup(); - - /* - * Reserve gigantic pages for hugetlb. This MUST occur before - * lowmem_end_addr is initialized below. - */ - if (memblock.memory.cnt > 1) { -#ifndef CONFIG_WII - memblock_enforce_memory_limit(memblock.memory.regions[0].size); - pr_warn("Only using first contiguous memory region\n"); -#else - wii_memory_fixups(); -#endif - } - total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; lowmem_end_addr = memstart_addr + total_lowmem; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB * entries, so we need to adjust lowmem to match the amount we can map * in the fixed entries */ adjust_total_lowmem(); -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; @@ -177,6 +127,8 @@ void __init MMU_init(void) setup_kup(); + update_mmu_feature_fixups(MMU_FTR_KUAP); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 386be136026e..b6f3ae03ca9e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -40,6 +40,8 @@ #include <linux/of_fdt.h> #include <linux/libfdt.h> #include <linux/memremap.h> +#include <linux/memory.h> +#include <linux/bootmem_info.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -59,6 +61,7 @@ #include <asm/sections.h> #include <asm/iommu.h> #include <asm/vdso.h> +#include <asm/hugetlb.h> #include <mm/mmu_decl.h> @@ -91,7 +94,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad * a page table lookup here because with the hash translation we don't keep * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { struct page *start; unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; @@ -110,7 +113,7 @@ static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_m } /* - * vmemmap virtual address space management does not have a traditonal page + * vmemmap virtual address space management does not have a traditional page * table to track which virtual struct pages are backed by physical mapping. * The virtual to physical mappings are tracked in a simple linked list * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at @@ -127,7 +130,7 @@ static struct vmemmap_backing *next; /* * The same pointer 'next' tracks individual chunks inside the allocated - * full page during the boot time and again tracks the freeed nodes during + * full page during the boot time and again tracks the freed nodes during * runtime. It is racy but it does not happen as they are separated by the * boot process. Will create problem if some how we have memory hotplug * operation during boot !! @@ -182,13 +185,13 @@ static __meminit int vmemmap_list_populate(unsigned long phys, return 0; } -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, - unsigned long page_size) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); - if ((start_pfn + nr_pfn) > altmap->end_pfn) + if ((start_pfn + nr_pfn - 1) > altmap->end_pfn) return true; if (start_pfn < altmap->base_pfn) @@ -197,8 +200,8 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; @@ -271,6 +274,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { @@ -302,8 +317,8 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -void __ref vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; unsigned long page_order = get_order(page_size); @@ -313,8 +328,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, start = ALIGN_DOWN(start, page_size); if (altmap) { alt_start = altmap->base_pfn; - alt_end = altmap->base_pfn + altmap->reserve + - altmap->free + altmap->alloc + altmap->align; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; } pr_debug("vmemmap_free %lx...%lx\n", start, end); @@ -361,15 +375,35 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, vmemmap_remove_mapping(start, page_size); } } + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); +} + #endif + +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) { } +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 +unsigned int mmu_lpid_bits; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(mmu_lpid_bits); +#endif +unsigned int mmu_pid_bits; + static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); static int __init parse_disable_radix(char *p) @@ -437,11 +471,180 @@ static void __init early_check_vec5(void) } } +static int __init dt_scan_mmu_pid_width(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU LPID, PID register size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size); + if (prop && size == 4) + mmu_lpid_bits = be32_to_cpup(prop); + + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + if (!mmu_pid_bits && !mmu_lpid_bits) + return 0; + + return 1; +} + +/* + * Outside hotplug the kernel uses this value to map the kernel direct map + * with radix. To be compatible with older kernels, let's keep this value + * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map + * things with 1GB size in the case where we don't support hotplug. + */ +#ifndef CONFIG_MEMORY_HOTPLUG +#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M +#else +#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE +#endif + +static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size) +{ + unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE; + + for (; *block_size > min_memory_block_size; *block_size >>= 2) { + if ((mem_size & *block_size) == 0) + break; + } +} + +static int __init probe_memory_block_size(unsigned long node, const char *uname, int + depth, void *data) +{ + const char *type; + unsigned long *block_size = (unsigned long *)data; + const __be32 *reg, *endp; + int l; + + if (depth != 1) + return 0; + /* + * If we have dynamic-reconfiguration-memory node, use the + * lmb value. + */ + if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) { + + const __be32 *prop; + + prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); + + if (!prop || l < dt_root_size_cells * sizeof(__be32)) + /* + * Nothing in the device tree + */ + *block_size = DEFAULT_MEMORY_BLOCK_SIZE; + else + *block_size = of_read_number(prop, dt_root_size_cells); + /* + * We have found the final value. Don't probe further. + */ + return 1; + } + /* + * Find all the device tree nodes of memory type and make sure + * the area can be mapped using the memory block size value + * we end up using. We start with 1G value and keep reducing + * it such that we can map the entire area using memory_block_size. + * This will be used on powernv and older pseries that don't + * have ibm,lmb-size node. + * For ex: with P5 we can end up with + * memory@0 -> 128MB + * memory@128M -> 64M + * This will end up using 64MB memory block size value. + */ + type = of_get_flat_dt_prop(node, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (!reg) + reg = of_get_flat_dt_prop(node, "reg", &l); + if (!reg) + return 0; + + endp = reg + (l / sizeof(__be32)); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { + const char *compatible; + u64 size; + + dt_mem_next_cell(dt_root_addr_cells, ®); + size = dt_mem_next_cell(dt_root_size_cells, ®); + + if (size) { + update_memory_block_size(block_size, size); + continue; + } + /* + * ibm,coherent-device-memory with linux,usable-memory = 0 + * Force 256MiB block size. Work around for GPUs on P9 PowerNV + * linux,usable-memory == 0 implies driver managed memory and + * we can't use large memory block size due to hotplug/unplug + * limitations. + */ + compatible = of_get_flat_dt_prop(node, "compatible", NULL); + if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) { + if (*block_size > SZ_256M) + *block_size = SZ_256M; + /* + * We keep 256M as the upper limit with GPU present. + */ + return 0; + } + } + /* continue looking for other memory device types */ + return 0; +} + +/* + * start with 1G memory block size. Early init will + * fix this with correct value. + */ +unsigned long memory_block_size __ro_after_init = 1UL << 30; +static void __init early_init_memory_block_size(void) +{ + /* + * We need to do memory_block_size probe early so that + * radix__early_init_mmu() can use this as limit for + * mapping page size. + */ + of_scan_flat_dt(probe_memory_block_size, &memory_block_size); +} + void __init mmu_early_init_devtree(void) { + bool hvmode = !!(mfmsr() & MSR_HV); + /* Disable radix mode based on kernel command line. */ - if (disable_radix) - cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + if (disable_radix) { + if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + else + pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); + } + + of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); + if (hvmode && !mmu_lpid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + mmu_lpid_bits = 12; /* POWER8-10 */ + else + mmu_lpid_bits = 10; /* POWER7 */ + } + if (!mmu_pid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + mmu_pid_bits = 20; /* POWER9-10 */ + } /* * Check /chosen/ibm,architecture-vec-5 if running as a guest. @@ -449,11 +652,14 @@ void __init mmu_early_init_devtree(void) * even though the ibm,architecture-vec-5 property created by * skiboot doesn't have the necessary bits set. */ - if (!(mfmsr() & MSR_HV)) + if (!hvmode) early_check_vec5(); + early_init_memory_block_size(); + if (early_radix_enabled()) { radix__early_init_devtree(); + /* * We have finalized the translation we are going to use by now. * Radix mode is not limited by RMA / VRMA addressing. @@ -463,5 +669,12 @@ void __init mmu_early_init_devtree(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } else hash__early_init_devtree(); + + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_defaultsize(); + + if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && + !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) + panic("kernel does not support any MMU type offered by platform"); } #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 57342154d2b0..4b4feba9873b 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -4,7 +4,6 @@ #include <linux/slab.h> #include <linux/mmzone.h> #include <linux/vmalloc.h> -#include <asm/io-workarounds.h> unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); @@ -14,8 +13,6 @@ void __iomem *ioremap(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_noncached(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } EXPORT_SYMBOL(ioremap); @@ -25,8 +22,6 @@ void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } EXPORT_SYMBOL(ioremap_wc); @@ -36,26 +31,18 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_cached(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot) { - pte_t pte = __pte(flags); + pte_t pte = __pte(pgprot_val(prot)); void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ if (pte_write(pte)) pte = pte_mkdirty(pte); - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - pte = pte_mkprivileged(pte); - - if (iowa_is_active()) - return iowa_ioremap(addr, size, pte_pgprot(pte), caller); return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } EXPORT_SYMBOL(ioremap_prot); @@ -66,7 +53,7 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long i; for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); + int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot)); if (WARN_ON_ONCE(err)) /* Should clean up */ return err; @@ -74,47 +61,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, return 0; } - -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller) -{ - struct vm_struct *area; - int ret; - unsigned long va; - - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = pa; - va = (unsigned long)area->addr; - - ret = ioremap_page_range(va, va + size, pa, prot); - if (!ret) - return (void __iomem *)area->addr + offset; - - vunmap_range(va, va + size); - free_vm_area(area); - - return NULL; -} - -#ifdef CONFIG_ZONE_DEVICE -/* - * Override the generic version in mm/memremap.c. - * - * With hash translation, the direct-map range is mapped with just one - * page size selected by htab_init_page_sizes(). Consult - * mmu_psize_defs[] to determine the minimum page size alignment. -*/ -unsigned long memremap_compat_align(void) -{ - unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; - - if (radix_enabled()) - return SUBSECTION_SIZE; - return max(SUBSECTION_SIZE, 1UL << shift); - -} -EXPORT_SYMBOL_GPL(memremap_compat_align); -#endif diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 9d13143b8be4..ca5bc6be3e6f 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -22,6 +22,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call int err; /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. * Before then, we use space going down from IOREMAP_TOP @@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16 * 1024 * 1024) - p += _ISA_MEM_BASE; - #ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. @@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call return (void __iomem *)v + offset; if (slab_is_available()) - return do_ioremap(p, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); /* * Should check if it is a candidate for a BAT mapping @@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr) if (v_block_mapped((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long)addr < ioremap_bot) - vunmap((void *)(PAGE_MASK & (unsigned long)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 3acece00b33e..fb8b55bd2cd5 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; if (slab_is_available()) - return do_ioremap(paligned, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); @@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, */ void iounmap(volatile void __iomem *token) { - void *addr; - if (!slab_is_available()) return; - addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); - - if ((unsigned long)addr < ioremap_bot) { - pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); - return; - } - vunmap(addr); + generic_iounmap(token); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c index 2784224054f8..989d6cdf4141 100644 --- a/arch/powerpc/mm/kasan/8xx.c +++ b/arch/powerpc/mm/kasan/8xx.c @@ -6,28 +6,33 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> +#include <asm/pgalloc.h> + static int __init kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block) { pmd_t *pmd = pmd_off_k(k_start); unsigned long k_cur, k_next; - for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) { - pte_basic_t *new; + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) { + pte_t *ptep; + int i; k_next = pgd_addr_end(k_cur, k_end); - k_next = pgd_addr_end(k_next, k_end); if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) continue; - new = memblock_alloc(sizeof(pte_basic_t), SZ_4K); - if (!new) + ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + if (!ptep) return -ENOMEM; - *new = pte_val(pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block)), PAGE_KERNEL))); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL)); - hugepd_populate_kernel((hugepd_t *)pmd, (pte_t *)new, PAGE_SHIFT_8M); - hugepd_populate_kernel((hugepd_t *)pmd + 1, (pte_t *)new, PAGE_SHIFT_8M); + __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1); + } + pmd_populate_kernel(&init_mm, pmd, ptep); + *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M); } return 0; } diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index bb1a5408b86b..f9522fd70b2f 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -1,7 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n +KCOV_INSTRUMENT := n -obj-$(CONFIG_PPC32) += kasan_init_32.o +obj-$(CONFIG_PPC32) += init_32.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o +obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c index 202bd260a009..450a67ef0bbe 100644 --- a/arch/powerpc/mm/kasan/book3s_32.c +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -10,47 +10,51 @@ int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); - unsigned long k_cur = k_start; - int k_size = k_end - k_start; - int k_size_base = 1 << (ffs(k_size) - 1); + unsigned long k_nobat = k_start; + unsigned long k_cur; + phys_addr_t phys; int ret; - void *block; - block = memblock_alloc(k_size, k_size_base); - - if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) { - int k_size_more = 1 << (ffs(k_size - k_size_base) - 1); - - setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL); - if (k_size_more >= SZ_128K) - setbat(-1, k_start + k_size_base, __pa(block) + k_size_base, - k_size_more, PAGE_KERNEL); - if (v_block_mapped(k_start)) - k_cur = k_start + k_size_base; - if (v_block_mapped(k_start + k_size_base)) - k_cur = k_start + k_size_base + k_size_more; - - update_bats(); + while (k_nobat < k_end) { + unsigned int k_size = bat_block_size(k_nobat, k_end); + int idx = find_free_bat(); + + if (idx == -1) + break; + if (k_size < SZ_128K) + break; + phys = memblock_phys_alloc_range(k_size, k_size, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + break; + + setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL); + k_nobat += k_size; } + if (k_nobat != k_start) + update_bats(); - if (!block) - block = memblock_alloc(k_size, PAGE_SIZE); - if (!block) - return -ENOMEM; + if (k_nobat < k_end) { + phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + return -ENOMEM; + } ret = kasan_init_shadow_page_tables(k_start, k_end); if (ret) return ret; - kasan_update_early_region(k_start, k_cur, __pte(0)); + kasan_update_early_region(k_start, k_nobat, __pte(0)); - for (; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_off_k(k_cur); - void *va = block + k_cur - k_start; - pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL); __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); } flush_tlb_kernel_range(k_start, k_end); + memset(kasan_mem_to_shadow(start), 0, k_end - k_start); + return 0; } diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/init_32.c index cf8770b1a692..1d083597464f 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -7,7 +7,7 @@ #include <linux/memblock.h> #include <linux/sched/task.h> #include <asm/pgalloc.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <mm/mmu_decl.h> static pgprot_t __init kasan_prot_ro(void) @@ -25,7 +25,7 @@ static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) int i; for (i = 0; i < PTRS_PER_PTE; i++, ptep++) - __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1); } int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) @@ -64,6 +64,7 @@ int __init __weak kasan_init_region(void *start, size_t size) if (ret) return ret; + k_start = k_start & PAGE_MASK; block = memblock_alloc(k_end - k_start, PAGE_SIZE); if (!block) return -ENOMEM; @@ -83,13 +84,12 @@ void __init kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte) { unsigned long k_cur; - phys_addr_t pa = __pa(kasan_early_shadow_page); for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_off_k(k_cur); pte_t *ptep = pte_offset_kernel(pmd, k_cur); - if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + if (pte_page(*ptep) != virt_to_page(lm_alias(kasan_early_shadow_page))) continue; __set_pte_at(&init_mm, k_cur, ptep, pte, 0); @@ -165,7 +165,7 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c new file mode 100644 index 000000000000..0d3a73d6d4b0 --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3e powerpc + * + * Copyright 2022, Christophe Leroy, CS GROUP France + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/set_memory.h> + +#include <asm/pgalloc.h> + +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} + +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (kasan_pud_table(*p4dp)) { + pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (kasan_pmd_table(*pudp)) { + pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (kasan_pte_table(*pmdp)) { + ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0); + + return 0; +} + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_early_init(void) +{ + int i; + unsigned long addr; + pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START); + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE) + p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud); +} + +void __init kasan_init(void) +{ + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + kasan_init_generic(); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c new file mode 100644 index 000000000000..dcafa641804c --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3S powerpc + * + * Copyright 2019-2022, Daniel Axtens, IBM Corporation. + */ + +/* + * ppc64 turns on virtual memory late in boot, after calling into generic code + * like the device-tree parser, so it uses this in conjunction with a hook in + * outline mode to avoid invalid access early in boot. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/sched/task.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_init(void) +{ + /* + * We want to do the following things: + * 1) Map real memory into the shadow for all physical memblocks + * This takes us from c000... to c008... + * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC + * will manage this for us. + * This takes us from c008... to c00a... + * 3) Map the 'early shadow'/zero page over iomap and vmemmap space. + * This takes us up to where we start at c00e... + */ + + void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END); + void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END); + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + if (!early_radix_enabled()) { + pr_warn("KASAN not enabled as it requires radix!"); + return; + } + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + /* map the early shadow over the iomap and vmemmap space */ + kasan_populate_early_shadow(k_start, k_end); + + /* mark early shadow region as RO and wipe it */ + zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + /* + * clear_page relies on some cache info that hasn't been set up yet. + * It ends up looping ~forever and blows up other data. + * Use memset instead. + */ + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + kasan_init_generic(); +} + +void __init kasan_early_init(void) { } + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c index aad7c47e0030..ea821d0ffe16 100644 --- a/arch/powerpc/mm/maccess.c +++ b/arch/powerpc/mm/maccess.c @@ -11,20 +11,3 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return is_kernel_addr((unsigned long)unsafe_src); } - -int copy_inst_from_kernel_nofault(struct ppc_inst *inst, u32 *src) -{ - unsigned int val, suffix; - int err; - - err = copy_from_kernel_nofault(&val, src, sizeof(val)); - if (err) - return err; - if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) { - err = copy_from_kernel_nofault(&suffix, src + 1, sizeof(suffix)); - *inst = ppc_inst_prefix(val, suffix); - } else { - *inst = ppc_inst(val); - } - return err; -} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad198b439222..3ddbfdbfa941 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -16,33 +16,39 @@ #include <linux/highmem.h> #include <linux/suspend.h> #include <linux/dma-direct.h> +#include <linux/execmem.h> +#include <linux/vmalloc.h> +#include <asm/swiotlb.h> #include <asm/machdep.h> #include <asm/rtas.h> #include <asm/kasan.h> -#include <asm/sparsemem.h> #include <asm/svm.h> +#include <asm/mmzone.h> +#include <asm/ftrace.h> +#include <asm/text-patching.h> +#include <asm/setup.h> +#include <asm/fixmap.h> #include <mm/mmu_decl.h> -unsigned long long memory_limit; -bool init_mem_is_free; +unsigned long long memory_limit __initdata; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot) +pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size, + pgprot_t vma_prot) { if (ppc_md.phys_mem_access_prot) - return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); + return ppc_md.phys_mem_access_prot(pfn, size, vma_prot); if (!page_is_ram(pfn)) vma_prot = pgprot_noncached(vma_prot); return vma_prot; } -EXPORT_SYMBOL(phys_mem_access_prot); +EXPORT_SYMBOL(__phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG static DEFINE_MUTEX(linear_mapping_mutex); @@ -52,6 +58,7 @@ int memory_add_physaddr_to_nid(u64 start) { return hot_add_scn_to_nid(start); } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif int __weak create_section_mapping(unsigned long start, unsigned long end, @@ -103,6 +110,37 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size) vm_unmap_aliases(); } +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +int __ref add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct mhp_params *params) +{ + int ret; + + ret = __add_pages(nid, start_pfn, nr_pages, params); + if (ret) + return ret; + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); + + return ret; +} + int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) { @@ -113,14 +151,13 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, rc = arch_create_linear_mapping(nid, start, size, params); if (rc) return rc; - rc = __add_pages(nid, start_pfn, nr_pages, params); + rc = add_pages(nid, start_pfn, nr_pages, params); if (rc) arch_remove_linear_mapping(start, size); return rc; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -179,7 +216,7 @@ static int __init mark_nonram_nosave(void) * everything else. GFP_DMA32 page allocations automatically fall back to * ZONE_DMA. * - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by * ZONE_DMA. @@ -193,6 +230,7 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); + int zone_dma_bits; #ifdef CONFIG_HIGHMEM unsigned long v = __fix_to_virt(FIX_KMAP_END); @@ -219,6 +257,8 @@ void __init paging_init(void) else zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); + #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(max_low_pfn, 1UL << (zone_dma_bits - PAGE_SHIFT)); @@ -233,7 +273,7 @@ void __init paging_init(void) mark_nonram_nosave(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* * book3s is limited to 16 page sizes due to encoding this in @@ -250,34 +290,12 @@ void __init mem_init(void) * back to to-down. */ memblock_set_bottom_up(true); - if (is_secure_guest()) - svm_swiotlb_init(); - else - swiotlb_init(0); + swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_pfn); - kasan_late_init(); - memblock_free_all(); - -#ifdef CONFIG_HIGHMEM - { - unsigned long pfn, highmem_mapnr; - - highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; - for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { - phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - if (!memblock_is_reserved(paddr)) - free_highmem_page(page); - } - } -#endif /* CONFIG_HIGHMEM */ - -#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) +#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up * functions.... do it here for the non-smp case. @@ -285,36 +303,14 @@ void __init mem_init(void) per_cpu(next_tlbcam_idx, smp_processor_id()) = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif - -#ifdef CONFIG_PPC32 - pr_info("Kernel virtual memory layout:\n"); -#ifdef CONFIG_KASAN - pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n", - KASAN_SHADOW_START, KASAN_SHADOW_END); -#endif - pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); -#ifdef CONFIG_HIGHMEM - pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", - PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); -#endif /* CONFIG_HIGHMEM */ - if (ioremap_bot != IOREMAP_TOP) - pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", - ioremap_bot, IOREMAP_TOP); - pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", - VMALLOC_START, VMALLOC_END); -#ifdef MODULES_VADDR - pr_info(" * 0x%08lx..0x%08lx : modules\n", - MODULES_VADDR, MODULES_END); -#endif -#endif /* CONFIG_PPC32 */ } void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); - init_mem_is_free = true; free_initmem_default(POISON_FREE_INITMEM); + ftrace_free_init_tramp(); } /* @@ -342,7 +338,7 @@ static int __init add_system_ram_resources(void) */ res->end = end - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - WARN_ON(request_resource(&iomem_resource, res) < 0); + WARN_ON(insert_resource(&iomem_resource, res) < 0); } } @@ -375,3 +371,80 @@ int devmem_is_allowed(unsigned long pfn) * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed. */ EXPORT_SYMBOL_GPL(walk_system_ram_range); + +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603) +static void prealloc_execmem_pgtable(void) +{ + unsigned long va; + + for (va = ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE); va < MODULES_END; va += PGDIR_SIZE) + pte_alloc_kernel(pmd_off_k(va), va); +} +#else +static void prealloc_execmem_pgtable(void) { } +#endif + +struct execmem_info __init *execmem_arch_setup(void) +{ + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; + unsigned long fallback_start = 0, fallback_end = 0; + unsigned long start, end; + + /* + * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and + * allow allocating data in the entire vmalloc space + */ +#ifdef MODULES_VADDR + unsigned long limit = (unsigned long)_etext - SZ_32M; + + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); + + /* First try within 32M limit from _etext to avoid branch trampolines */ + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) { + start = limit; + fallback_start = MODULES_VADDR; + fallback_end = MODULES_END; + } else { + start = MODULES_VADDR; + } + + end = MODULES_END; +#else + start = VMALLOC_START; + end = VMALLOC_END; +#endif + + prealloc_execmem_pgtable(); + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = start, + .end = end, + .pgprot = prot, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, + [EXECMEM_MODULE_DATA] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_EXECMEM */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c deleted file mode 100644 index ae683fdc716c..000000000000 --- a/arch/powerpc/mm/mmap.c +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * flexible mmap layout support - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched/signal.h> -#include <linux/sched/mm.h> -#include <linux/elf-randomize.h> -#include <linux/security.h> -#include <linux/mman.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long shift, rnd; - - shift = mmap_rnd_bits; -#ifdef CONFIG_COMPAT - if (is_32bit_task()) - shift = mmap_rnd_compat_bits; -#endif - rnd = get_random_long() % (1ul << shift); - - return rnd << PAGE_SHIFT; -} - -static inline unsigned long stack_maxrandom_size(void) -{ - if (!(current->flags & PF_RANDOMIZE)) - return 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - return (1<<23); - else - return (1<<30); -} - -static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_maxrandom_size() + stack_guard_gap; - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); -} - -#ifdef CONFIG_PPC_RADIX_MMU -/* - * Same function as generic code used only for radix, because we don't need to overload - * the generic one. But we will have to duplicate, because hash select - * HAVE_ARCH_UNMAPPED_AREA - */ -static unsigned long -radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = 0; - info.length = len; - info.low_limit = mm->mmap_base; - info.high_limit = high_limit; - info.align_mask = 0; - - return vm_unmapped_area(&info); -} - -static unsigned long -radix__arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - return addr; - } - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = 0; - - addr = vm_unmapped_area(&info); - if (!(addr & ~PAGE_MASK)) - return addr; - VM_BUG_ON(addr != -ENOMEM); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -} - -static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - struct rlimit *rlim_stack) -{ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = radix__arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; - } -} -#else -/* dummy */ -extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - struct rlimit *rlim_stack); -#endif -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor, - rlim_stack); - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 74246536b832..3e3af29b4523 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -18,6 +18,12 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, { /* 32-bit keeps track of the current PGDIR in the thread struct */ tsk->thread.pgdir = mm->pgd; +#ifdef CONFIG_PPC_BOOK3S_32 + tsk->thread.sr0 = mm->context.sr0; +#endif +#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = mm->context.id; +#endif } #elif defined(CONFIG_PPC_BOOK3E_64) static inline void switch_mm_pgdir(struct task_struct *tsk, @@ -25,6 +31,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, { /* 64-bit Book3E keeps track of current PGD in the PACA */ get_paca()->pgd = mm->pgd; +#ifdef CONFIG_PPC_KUAP + tsk->thread.pid = mm->context.id; +#endif } #else static inline void switch_mm_pgdir(struct task_struct *tsk, @@ -34,11 +43,13 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + int cpu = smp_processor_id(); bool new_on_cpu = false; /* Mark this context has been used on the new CPU */ - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { - cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + VM_WARN_ON_ONCE(next == &init_mm); + cpumask_set_cpu(cpu, mm_cpumask(next)); inc_mm_active_cpus(next); /* @@ -81,7 +92,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * context */ if (cpu_has_feature(CPU_FTR_ALTIVEC)) - asm volatile ("dssall"); + asm volatile (PPC_DSSALL); if (!new_on_cpu) membarrier_arch_switch_mm(prev, next, tsk); @@ -91,6 +102,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * sub architectures. Out of line for now */ switch_mmu_context(prev, next, tsk); + + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev))); } #ifndef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7dac910c0b21..b2d1eea09761 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -20,9 +20,9 @@ #include <asm/trace.h> /* - * On 40x and 8xx, we directly inline tlbia and tlbivax + * On 8xx, we directly inline tlbia */ -#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx) +#ifdef CONFIG_PPC_8xx static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); @@ -35,15 +35,15 @@ static inline void _tlbil_pid(unsigned int pid) } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) -#else /* CONFIG_40x || CONFIG_PPC_8xx */ +#else /* CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif -#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */ +#endif /* !CONFIG_PPC_8xx */ /* * On 8xx, we directly inline tlbie, on others, it's extern @@ -55,7 +55,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); trace_tlbie(0, 0, address, pid, 0, 0, 0); } -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) extern void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, } #endif /* CONFIG_PPC_8xx */ -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -92,29 +92,16 @@ extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); -extern int __map_without_bats; -extern unsigned int rtas_data, rtas_size; - -struct hash_pte; extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ extern unsigned long __max_low_memory; -extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; extern phys_addr_t total_lowmem; extern phys_addr_t memstart_addr; extern phys_addr_t lowmem_end_addr; -#ifdef CONFIG_WII -extern unsigned long wii_hole_start; -extern unsigned long wii_hole_size; - -extern unsigned long wii_mmu_mapin_mem2(unsigned long top); -extern void wii_memory_fixups(void); -#endif - /* ...and now those things that may be slightly different between processor * architectures. -- Dan */ @@ -123,18 +110,18 @@ extern void MMU_init_hw(void); void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif +void mmu_init_secondary(int cpu); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, - bool dryrun); -extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys); + bool dryrun, bool init); #ifdef CONFIG_PPC32 extern void adjust_total_lowmem(void); extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); void reloc_kernel_entry(void *fdt, int addr); +void relocate_init(u64 dt_ptr, phys_addr_t start); extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); @@ -155,11 +142,15 @@ struct tlbcam { u32 MAS3; u32 MAS7; }; + +#define NUM_TLBCAMS 64 + +extern struct tlbcam TLBCAM[NUM_TLBCAMS]; #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx) /* 6xx have BATS */ -/* FSL_BOOKE have TLBCAM */ +/* PPC_85xx have TLBCAM */ /* 8xx have LTLB */ phys_addr_t v_block_mapped(unsigned long va); unsigned long p_block_mapped(phys_addr_t pa); @@ -168,25 +159,26 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif -#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) -void mmu_mark_initmem_nx(void); -void mmu_mark_rodata_ro(void); +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) +int mmu_mark_initmem_nx(void); +int mmu_mark_rodata_ro(void); #else -static inline void mmu_mark_initmem_nx(void) { } -static inline void mmu_mark_rodata_ro(void) { } +static inline int mmu_mark_initmem_nx(void) { return 0; } +static inline int mmu_mark_rodata_ro(void) { return 0; } #endif #ifdef CONFIG_PPC_8xx void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_PPC_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); } + +#ifdef CONFIG_MEMORY_HOTPLUG +int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif + +int hash__kernel_map_pages(struct page *page, int numpages, int enable); diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c deleted file mode 100644 index 95751c322f6c..000000000000 --- a/arch/powerpc/mm/nohash/40x.c +++ /dev/null @@ -1,152 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/stddef.h> -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/highmem.h> -#include <linux/memblock.h> - -#include <asm/prom.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#include <asm/mmu.h> -#include <linux/uaccess.h> -#include <asm/smp.h> -#include <asm/bootx.h> -#include <asm/machdep.h> -#include <asm/setup.h> - -#include <mm/mmu_decl.h> - -extern int __map_without_ltlbs; -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - /* - * The Zone Protection Register (ZPR) defines how protection will - * be applied to every page which is a member of a given zone. At - * present, we utilize only two of the 4xx's zones. - * The zone index bits (of ZSEL) in the PTE are used for software - * indicators, except the LSB. For user access, zone 1 is used, - * for kernel access, zone 0 is used. We set all but zone 1 - * to zero, allowing only kernel access as indicated in the PTE. - * For zone 1, we set a 01 binary (a value of 10 will not work) - * to allow user access as indicated in the PTE. This also allows - * kernel access as indicated in the PTE. - */ - - mtspr(SPRN_ZPR, 0x10000000); - - flush_instruction_cache(); - - /* - * Set up the real-mode cache parameters for the exception vector - * handlers (which are run in real-mode). - */ - - mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ - - /* - * Cache instruction and data space where the exception - * vectors and the kernel live in real-mode. - */ - - mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ - mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ -} - -#define LARGE_PAGE_SIZE_16M (1<<24) -#define LARGE_PAGE_SIZE_4M (1<<22) - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long v, s, mapped; - phys_addr_t p; - - v = KERNELBASE; - p = 0; - s = total_lowmem; - - if (__map_without_ltlbs) - return 0; - - while (s >= LARGE_PAGE_SIZE_16M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_RW; - - pmdp = pmd_off_k(v); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - - v += LARGE_PAGE_SIZE_16M; - p += LARGE_PAGE_SIZE_16M; - s -= LARGE_PAGE_SIZE_16M; - } - - while (s >= LARGE_PAGE_SIZE_4M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_RW; - - pmdp = pmd_off_k(v); - *pmdp = __pmd(val); - - v += LARGE_PAGE_SIZE_4M; - p += LARGE_PAGE_SIZE_4M; - s -= LARGE_PAGE_SIZE_4M; - } - - mapped = total_lowmem - s; - - /* If the size of RAM is not an exact power of two, we may not - * have covered RAM in its entirety with 16 and 4 MiB - * pages. Consequently, restrict the top end of RAM currently - * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" - * coverage with normal-sized pages (or other reasons) do not - * attempt to allocate outside the allowed range. - */ - memblock_set_current_limit(mapped); - - return mapped; -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 40x can only access 16MB at the moment (see head_40x.S) */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); -} diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index e079f26b267e..6d10c6d8be71 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -24,7 +24,7 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/smp.h> #include <mm/mmu_decl.h> @@ -38,7 +38,7 @@ int icache_44x_need_flush; unsigned long tlb_47x_boltmap[1024/8]; -static void ppc44x_update_tlb_hwater(void) +static void __init ppc44x_update_tlb_hwater(void) { /* The TLB miss handlers hard codes the watermark in a cmpli * instruction to improve performances rather than loading it @@ -122,7 +122,7 @@ static void __init ppc47x_update_boltmap(void) /* * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys) { unsigned int rA; int bolted; @@ -240,19 +240,3 @@ void __init mmu_init_secondary(int cpu) } } #endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (smp_processor_id() != boot_cpuid) - return; - - if (disabled) - patch_instruction_site(&patch__tlb_44x_kuep, ppc_inst(PPC_RAW_NOP())); - else - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - if (IS_ENABLED(CONFIG_PPC_47x) && disabled) - patch_instruction_site(&patch__tlb_47x_kuep, ppc_inst(PPC_RAW_NOP())); -} -#endif diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 60780e089118..ab1505cf42bf 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -8,18 +8,15 @@ */ #include <linux/memblock.h> -#include <linux/mmu_context.h> #include <linux/hugetlb.h> + #include <asm/fixmap.h> -#include <asm/code-patching.h> -#include <asm/inst.h> +#include <asm/pgalloc.h> #include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) -extern int __map_without_ltlbs; - static unsigned long block_mapped_ram; /* @@ -32,8 +29,6 @@ phys_addr_t v_block_mapped(unsigned long va) if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) return __pa(va); return 0; @@ -49,59 +44,52 @@ unsigned long p_block_mapped(phys_addr_t pa) if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; - if (__map_without_ltlbs) - return 0; if (pa < block_mapped_ram) return (unsigned long)__va(pa); return 0; } -static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va) -{ - if (hpd_val(*pmdp) == 0) { - pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K); - - if (!ptep) - return NULL; - - hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M); - hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M); - } - return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT); -} - static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, pgprot_t prot, int psize, bool new) { pmd_t *pmdp = pmd_off_k(va); pte_t *ptep; - - if (WARN_ON(psize != MMU_PAGE_512K && psize != MMU_PAGE_8M)) - return -EINVAL; + unsigned int shift = mmu_psize_to_shift(psize); if (new) { if (WARN_ON(slab_is_available())) return -EINVAL; - if (psize == MMU_PAGE_512K) + if (psize == MMU_PAGE_8M) { + if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1)))) + return -EINVAL; + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp + 1, ptep); + + ptep = (pte_t *)pmdp; + } else { ptep = early_pte_alloc_kernel(pmdp, va); - else - ptep = early_hugepd_alloc_kernel((hugepd_t *)pmdp, va); + /* The PTE should never be already present */ + if (WARN_ON(pte_present(*ptep) && pgprot_val(prot))) + return -EINVAL; + } } else { - if (psize == MMU_PAGE_512K) - ptep = pte_offset_kernel(pmdp, va); + if (psize == MMU_PAGE_8M) + ptep = (pte_t *)pmdp; else - ptep = hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT); + ptep = pte_offset_kernel(pmdp, va); } if (WARN_ON(!ptep)) return -ENOMEM; - /* The PTE should never be already present */ - if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot))) - return -EINVAL; - - set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot))); + set_huge_pte_at(&init_mm, va, ptep, + arch_make_huge_pte(pfn_pte(pa >> PAGE_SHIFT, prot), shift, 0), + 1UL << shift); return 0; } @@ -126,23 +114,30 @@ void __init mmu_mapin_immr(void) PAGE_KERNEL_NCG, MMU_PAGE_512K, true); } -static void mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, - pgprot_t prot, bool new) +static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, + pgprot_t prot, bool new) { unsigned long v = PAGE_OFFSET + offset; unsigned long p = offset; + int err = 0; - WARN_ON(!IS_ALIGNED(offset, SZ_512K) || !IS_ALIGNED(top, SZ_512K)); + WARN_ON(!IS_ALIGNED(offset, SZ_16K) || !IS_ALIGNED(top, SZ_16K)); - for (; p < ALIGN(p, SZ_8M) && p < top; p += SZ_512K, v += SZ_512K) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); - for (; p < ALIGN_DOWN(top, SZ_8M) && p < top; p += SZ_8M, v += SZ_8M) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new); - for (; p < ALIGN_DOWN(top, SZ_512K) && p < top; p += SZ_512K, v += SZ_512K) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN(p, SZ_512K) && p < top && !err; p += SZ_16K, v += SZ_16K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new); + for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new); + for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN_DOWN(top, SZ_16K) && p < top && !err; p += SZ_16K, v += SZ_16K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new); if (!new) flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top); + + return err; } unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) @@ -157,14 +152,11 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) mmu_mapin_immr(); - if (__map_without_ltlbs) - return 0; - - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true); + mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_X, true); if (debug_pagealloc_enabled_or_kfence()) { top = boundary; } else { - mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_TEXT, true); + mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_X, true); mmu_mapin_ram_chunk(einittext8, top, PAGE_KERNEL, true); } @@ -176,27 +168,34 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return top; } -void mmu_mark_initmem_nx(void) +int mmu_mark_initmem_nx(void) { unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); unsigned long sinittext = __pa(_sinittext); unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8; unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + int err = 0; - mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, false); - mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); + if (!debug_pagealloc_enabled_or_kfence()) + err = mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); - mmu_pin_tlb(block_mapped_ram, false); + if (IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_pin_tlb(block_mapped_ram, false); + + return err; } #ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) +int mmu_mark_rodata_ro(void) { unsigned long sinittext = __pa(_sinittext); + int err; - mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false); + err = mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false); if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) mmu_pin_tlb(block_mapped_ram, true); + + return err; } #endif @@ -210,33 +209,16 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, /* 8xx can only access 32MB at the moment */ memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M)); + + BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE) < TASK_SIZE); } -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) +int pud_clear_huge(pud_t *pud) { - if (disabled) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - mtspr(SPRN_MI_AP, MI_APG_KUEP); + return 0; } -#endif - -#ifdef CONFIG_PPC_KUAP -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); -void __init setup_kuap(bool disabled) +int pmd_clear_huge(pmd_t *pmd) { - if (disabled) { - static_branch_enable(&disable_kuap_key); - return; - } - - pr_info("Activating Kernel Userspace Access Protection\n"); - - mtspr(SPRN_MD_AP, MD_APG_KUAP); + return 0; } -#endif diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index 0424f6ce5bd8..cf60c776c883 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -1,19 +1,16 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - -obj-y += mmu_context.o tlb.o tlb_low.o -obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o -obj-$(CONFIG_40x) += 40x.o +obj-y += mmu_context.o tlb.o tlb_low.o kup.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +obj-$(CONFIG_PPC_E500) += e500.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_fsl_booke.o := n +KCOV_INSTRUMENT_e500.o := n diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 77884e24281d..062e8785c1bb 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -10,6 +10,7 @@ #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/dma.h> +#include <asm/text-patching.h> #include <mm/mmu_decl.h> @@ -28,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start, _PAGE_KERNEL_RW; /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf); /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8; /* For each PTE for that area, map things. Note that we don't * increment phys because all PTEs are of the large size and @@ -70,7 +71,7 @@ static void __init *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot) { pgd_t *pgdp; p4d_t *p4dp; @@ -95,8 +96,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) pgdp = pgd_offset_k(ea); p4dp = p4d_offset(pgdp, ea); if (p4d_none(*p4dp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - p4d_populate(&init_mm, p4dp, pmdp); + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); } pudp = pud_offset(p4dp, ea); if (pud_none(*pudp)) { @@ -105,7 +106,7 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); + ptep = early_alloc_pgtable(PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); @@ -115,3 +116,17 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) smp_wmb(); return 0; } + +void __patch_exception(int exc, unsigned long addr) +{ + unsigned int *ibase = &interrupt_base_book3e; + + /* + * Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one. + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/e500.c index 03dacbe940e5..266fb22131fc 100644 --- a/arch/powerpc/mm/nohash/fsl_booke.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -36,8 +36,8 @@ #include <linux/delay.h> #include <linux/highmem.h> #include <linux/memblock.h> +#include <linux/of_fdt.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/mmu.h> @@ -51,21 +51,15 @@ unsigned int tlbcam_index; -#define NUM_TLBCAMS (64) struct tlbcam TLBCAM[NUM_TLBCAMS]; -struct tlbcamrange { +static struct { unsigned long start; unsigned long limit; phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* * Return PA for this VA if it is mapped by a CAM, or 0 */ @@ -122,15 +116,18 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0; if (mmu_has_feature(MMU_FTR_BIG_PHYS)) TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); + if (!is_kernel_addr(virt)) { + TLBCAM[index].MAS3 |= MAS3_UR; + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0; + } else { + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; } tlbcam_addrs[index].start = virt; @@ -138,8 +135,8 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, tlbcam_addrs[index].phys = phys; } -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) +static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) { unsigned int camsize = __ilog2(ram); unsigned int align = __ffs(virt | phys); @@ -165,19 +162,38 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, unsigned long ram, int max_cam_idx, - bool dryrun) + bool dryrun, bool init) { int i; unsigned long amount_mapped = 0; + unsigned long boundary; + + if (strict_kernel_rwx_enabled()) + boundary = (unsigned long)(_sinittext - _stext); + else + boundary = ram; /* Calculate CAM values */ - for (i = 0; ram && i < max_cam_idx; i++) { + for (i = 0; boundary && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; + + cam_sz = calc_cam_sz(boundary, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + boundary -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; cam_sz = calc_cam_sz(ram, virt, phys); if (!dryrun) - settlbcam(i, virt, phys, cam_sz, - pgprot_val(PAGE_KERNEL_X), 0); + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); ram -= cam_sz; amount_mapped += cam_sz; @@ -188,8 +204,13 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, if (dryrun) return amount_mapped; - loadcam_multi(0, i, max_cam_idx); - tlbcam_index = i; + if (init) { + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + } else { + loadcam_multi(0, i, 0); + WARN_ON(i > tlbcam_index); + } #ifdef CONFIG_PPC64 get_paca()->tcd.esel_next = i; @@ -200,12 +221,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, return amount_mapped; } -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) { unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); } #ifdef CONFIG_PPC32 @@ -237,6 +258,11 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } +static unsigned long __init tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + void __init adjust_total_lowmem(void) { unsigned long ram; @@ -246,8 +272,8 @@ void __init adjust_total_lowmem(void) ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); - restore_to_as0(i, 0, 0, 1); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); + restore_to_as0(i, 0, NULL, 1); pr_info("Memory CAM mapping: "); for (i = 0; i < tlbcam_index - 1; i++) @@ -258,6 +284,26 @@ void __init adjust_total_lowmem(void) memblock_set_current_limit(memstart_addr + __max_low_memory); } +#ifdef CONFIG_STRICT_KERNEL_RWX +int mmu_mark_rodata_ro(void) +{ + unsigned long remapped; + + remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); + + if (WARN_ON(__max_low_memory != remapped)) + return -EINVAL; + + return 0; +} +#endif + +int mmu_mark_initmem_nx(void) +{ + /* Everything is done in mmu_mark_rodata_ro() */ + return 0; +} + void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { @@ -317,11 +363,11 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) /* map a 64M area for the second relocation */ if (memstart_addr > start) map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); + false, true); else map_mem_in_cams_addr(start, PAGE_OFFSET + offset, 0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); + false, true); restore_to_as0(n, offset, __va(dt_ptr), 1); /* We should never reach here */ panic("Relocation error"); diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 8b88be91b622..a134d28a0e4d 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -45,7 +45,9 @@ static inline void book3e_tlb_lock(void) if (!cpu_has_feature(CPU_FTR_SMT)) return; - asm volatile("1: lbarx %0, 0, %1;" + asm volatile(".machine push;" + ".machine e6500;" + "1: lbarx %0, 0, %1;" "cmpwi %0, 0;" "bne 2f;" "stbcx. %2, 0, %1;" @@ -56,6 +58,7 @@ static inline void book3e_tlb_lock(void) "bne 2b;" "b 1b;" "3:" + ".machine pop;" : "=&r" (tmp) : "r" (&paca->tcd_ptr->lock), "r" (token) : "memory"); @@ -103,21 +106,11 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) int found = 0; mtspr(SPRN_MAS6, pid << 16); - if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { - asm volatile( - "li %0,0\n" - "tlbsx. 0,%1\n" - "bne 1f\n" - "li %0,1\n" - "1:\n" - : "=&r"(found) : "r"(ea)); - } else { - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - } + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); return found; } @@ -142,7 +135,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) tsize = shift - 10; /* * We can't be interrupted while we're setting up the MAS - * regusters or after we've confirmed that no tlb exists. + * registers or after we've confirmed that no tlb exists. */ local_irq_save(flags); @@ -169,13 +162,9 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) mtspr(SPRN_MAS1, mas1); mtspr(SPRN_MAS2, mas2); - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); asm volatile ("tlbwe"); @@ -189,7 +178,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 4c74e8a5482b..5e4897daaaea 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,13 +13,12 @@ #include <linux/delay.h> #include <linux/memblock.h> #include <linux/libfdt.h> -#include <linux/crash_core.h> +#include <linux/crash_reserve.h> +#include <linux/of.h> +#include <linux/of_fdt.h> #include <asm/cacheflush.h> -#include <asm/prom.h> #include <asm/kdump.h> #include <mm/mmu_decl.h> -#include <generated/compile.h> -#include <generated/utsrelease.h> struct regions { unsigned long pa_start; @@ -36,17 +35,11 @@ struct regions { int reserved_mem_size_cells; }; -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - struct regions __initdata regions; static __init void kaslr_get_cmdline(void *fdt) { - int node = fdt_path_offset(fdt, "/chosen"); - - early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line); + early_init_dt_scan_chosen(boot_command_line); } static unsigned long __init rotate_xor(unsigned long hash, const void *area, @@ -72,7 +65,8 @@ static unsigned long __init get_boot_seed(void *fdt) { unsigned long hash = 0; - hash = rotate_xor(hash, build_str, sizeof(build_str)); + /* build-specific string for starting entropy. */ + hash = rotate_xor(hash, linux_banner, strlen(linux_banner)); hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); return hash; @@ -179,12 +173,12 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base); + &crash_base, NULL, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) @@ -314,10 +308,10 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size pr_warn("KASLR: No safe seed for randomizing the kernel base.\n"); ram = min_t(phys_addr_t, __max_low_memory, size); - ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true); + ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true); linear_sz = min_t(unsigned long, ram, SZ_512M); - /* If the linear size is smaller than 64M, do not randmize */ + /* If the linear size is smaller than 64M, do not randomize */ if (linear_sz < SZ_64M) return 0; @@ -382,7 +376,7 @@ notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size) create_kaslr_tlb_entry(1, tlb_virt, tlb_phys); } - /* Copy the kernel to it's new location and run */ + /* Copy the kernel to its new location and run */ memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz); flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz); diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c new file mode 100644 index 000000000000..c20c4f357fbf --- /dev/null +++ b/arch/powerpc/mm/nohash/kup.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing kernel userspace protection + */ + +#include <linux/export.h> +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/smp.h> + +#include <asm/kup.h> +#include <asm/smp.h> + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled) { + if (smp_processor_id() == boot_cpuid) + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; + return; + } + + pr_info("Activating Kernel Userspace Access Protection\n"); + + prevent_user_access(KUAP_READ_WRITE); +} +#endif diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 44b2b5e7cabe..28a96a10c907 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -33,6 +33,7 @@ #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/smp.h> +#include <asm/kup.h> #include <mm/mmu_decl.h> @@ -202,25 +203,14 @@ static unsigned int steal_context_up(unsigned int id) static void set_context(unsigned long id, pgd_t *pgd) { if (IS_ENABLED(CONFIG_PPC_8xx)) { - s16 offset = (s16)(__pa(swapper_pg_dir)); - - /* - * Register M_TWB will contain base address of level 1 table minus the - * lower part of the kernel PGDIR base address, so that all accesses to - * level 1 table are done relative to lower part of kernel PGDIR base - * address. - */ - mtspr(SPRN_M_TWB, __pa(pgd) - offset); + mtspr(SPRN_M_TWB, __pa(pgd)); /* Update context */ mtspr(SPRN_M_CASID, id - 1); /* sync */ mb(); - } else { - if (IS_ENABLED(CONFIG_40x)) - mb(); /* sync */ - + } else if (kuap_is_disabled()) { mtspr(SPRN_PID, id); isync(); } @@ -305,6 +295,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; set_context(id, next->pgd); +#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = id; +#endif raw_spin_unlock(&context_lock); } @@ -313,15 +306,6 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { - /* - * We have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - */ - if (mm->context.id == 0) - slice_init_new_context_exec(mm); mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; pte_frag_set(&mm->context, NULL); @@ -393,21 +377,11 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1), SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); if (IS_ENABLED(CONFIG_SMP)) { - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - + stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 5872f69141d5..0a650742f3a0 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -37,7 +37,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cputhreads.h> #include <asm/hugetlb.h> #include <asm/paca.h> @@ -49,39 +49,44 @@ * other sizes not listed here. The .ind field is only used on MMUs that have * indirect page table entries. */ -#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, - .enc = BOOK3E_PAGESZ_4K, }, [MMU_PAGE_2M] = { .shift = 21, - .enc = BOOK3E_PAGESZ_2M, }, [MMU_PAGE_4M] = { .shift = 22, - .enc = BOOK3E_PAGESZ_4M, }, [MMU_PAGE_16M] = { .shift = 24, - .enc = BOOK3E_PAGESZ_16M, }, [MMU_PAGE_64M] = { .shift = 26, - .enc = BOOK3E_PAGESZ_64M, }, [MMU_PAGE_256M] = { .shift = 28, - .enc = BOOK3E_PAGESZ_256M, }, [MMU_PAGE_1G] = { .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, }, }; -#elif defined(CONFIG_PPC_8xx) + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].shift - 10; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +#ifdef CONFIG_PPC_8xx struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, @@ -96,78 +101,9 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { .shift = 23, }, }; -#else -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .ind = 20, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_16K] = { - .shift = 14, - .enc = BOOK3E_PAGESZ_16K, - }, - [MMU_PAGE_64K] = { - .shift = 16, - .ind = 28, - .enc = BOOK3E_PAGESZ_64K, - }, - [MMU_PAGE_1M] = { - .shift = 20, - .enc = BOOK3E_PAGESZ_1M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .ind = 36, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#endif /* CONFIG_FSL_BOOKE */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} -#else -static inline int mmu_get_tsize(int psize) -{ - /* This isn't used on !Book3E for now */ - return 0; -} -#endif /* CONFIG_PPC_BOOK3E_MMU */ - -/* The variables below are currently only used on 64-bit Book3E - * though this will probably be made common with other nohash - * implementations at some point - */ -#ifdef CONFIG_PPC64 - -int mmu_linear_psize; /* Page size used for the linear mapping */ -int mmu_pte_psize; /* Page size used for PTE pages */ -int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ -int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ -unsigned long linear_map_top; /* Top of linear mapping */ - - -/* - * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug - * exceptions. This is used for bolted and e6500 TLB miss handlers which - * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, - * this is set to zero. - */ -int extlb_level_exc; - -#endif /* CONFIG_PPC64 */ +#endif -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 /* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ DEFINE_PER_CPU(int, next_tlbcam_idx); EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); @@ -185,6 +121,7 @@ EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); * processor */ +#ifndef CONFIG_PPC_8xx /* * These are the base non-SMP variants of page and mm flushing */ @@ -219,6 +156,15 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(local_flush_tlb_page); +void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + __local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0); +} +EXPORT_SYMBOL(local_flush_tlb_page_psize); + +#endif + /* * And here are the SMP non-local implementations */ @@ -343,20 +289,10 @@ EXPORT_SYMBOL(flush_tlb_page); #endif /* CONFIG_SMP */ -#ifdef CONFIG_PPC_47x -void __init early_init_mmu_47x(void) -{ -#ifdef CONFIG_SMP - unsigned long root = of_get_flat_dt_root(); - if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) - mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); -#endif /* CONFIG_SMP */ -} -#endif /* CONFIG_PPC_47x */ - /* * Flush kernel TLB entries in the given range */ +#ifndef CONFIG_PPC_8xx void flush_tlb_kernel_range(unsigned long start, unsigned long end) { #ifdef CONFIG_SMP @@ -369,6 +305,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) #endif } EXPORT_SYMBOL(flush_tlb_kernel_range); +#endif /* * Currently, for range flushing, we just do a full mm flush. This should @@ -392,397 +329,13 @@ void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm(tlb->mm); } -/* - * Below are functions specific to the 64-bit variant of Book3E though that - * may change in the future - */ - -#ifdef CONFIG_PPC64 - -/* - * Handling of virtual linear page tables or indirect TLB entries - * flushing when PTE pages are freed - */ -void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - int tsize = mmu_psize_defs[mmu_pte_psize].enc; - - if (book3e_htw_mode != PPC_HTW_NONE) { - unsigned long start = address & PMD_MASK; - unsigned long end = address + PMD_SIZE; - unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; - - /* This isn't the most optimal, ideally we would factor out the - * while preempt & CPU mask mucking around, or even the IPI but - * it will do for now - */ - while (start < end) { - __flush_tlb_page(tlb->mm, start, tsize, 1); - start += size; - } - } else { - unsigned long rmask = 0xf000000000000000ul; - unsigned long rid = (address & rmask) | 0x1000000000000000ul; - unsigned long vpte = address & ~rmask; - - vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; - vpte |= rid; - __flush_tlb_page(tlb->mm, vpte, tsize, 0); - } -} - -static void setup_page_sizes(void) -{ - unsigned int tlb0cfg; - unsigned int tlb0ps; - unsigned int eptcfg; - int i, psize; - -#ifdef CONFIG_PPC_FSL_BOOK3E - unsigned int mmucfg = mfspr(SPRN_MMUCFG); - int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); - unsigned int min_pg, max_pg; - - min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; - max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def; - unsigned int shift; - - def = &mmu_psize_defs[psize]; - shift = def->shift; - - if (shift == 0 || shift & 1) - continue; - - /* adjust to be in terms of 4^shift Kb */ - shift = (shift - 10) >> 1; - - if ((shift >= min_pg) && (shift <= max_pg)) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - goto out; - } - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { - u32 tlb1cfg, tlb1ps; - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb1cfg = mfspr(SPRN_TLB1CFG); - tlb1ps = mfspr(SPRN_TLB1PS); - eptcfg = mfspr(SPRN_EPTCFG); - - if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) - book3e_htw_mode = PPC_HTW_E6500; - - /* - * We expect 4K subpage size and unrestricted indirect size. - * The lack of a restriction on indirect size is a Freescale - * extension, indicated by PSn = 0 but SPSn != 0. - */ - if (eptcfg != 2) - book3e_htw_mode = PPC_HTW_NONE; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (!def->shift) - continue; - - if (tlb1ps & (1U << (def->shift - 10))) { - def->flags |= MMU_PAGE_SIZE_DIRECT; - - if (book3e_htw_mode && psize == MMU_PAGE_2M) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - } - } - - goto out; - } -#endif - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb0ps = mfspr(SPRN_TLB0PS); - eptcfg = mfspr(SPRN_EPTCFG); - - /* Look for supported direct sizes */ - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb0ps & (1U << (def->shift - 10))) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - /* Indirect page sizes supported ? */ - if ((tlb0cfg & TLBnCFG_IND) == 0 || - (tlb0cfg & TLBnCFG_PT) == 0) - goto out; - - book3e_htw_mode = PPC_HTW_IBM; - - /* Now, we only deal with one IND page size for each - * direct size. Hopefully all implementations today are - * unambiguous, but we might want to be careful in the - * future. - */ - for (i = 0; i < 3; i++) { - unsigned int ps, sps; - - sps = eptcfg & 0x1f; - eptcfg >>= 5; - ps = eptcfg & 0x1f; - eptcfg >>= 5; - if (!ps || !sps) - continue; - for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (ps == (def->shift - 10)) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - if (sps == (def->shift - 10)) - def->ind = ps + 10; - } - } - -out: - /* Cleanup array and print summary */ - pr_info("MMU: Supported page sizes\n"); - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - const char *__page_type_names[] = { - "unsupported", - "direct", - "indirect", - "direct & indirect" - }; - if (def->flags == 0) { - def->shift = 0; - continue; - } - pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), - __page_type_names[def->flags & 0x3]); - } -} - -static void setup_mmu_htw(void) -{ - /* - * If we want to use HW tablewalk, enable it by patching the TLB miss - * handlers to branch to the one dedicated to it. - */ - - switch (book3e_htw_mode) { - case PPC_HTW_IBM: - patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); - break; -#ifdef CONFIG_PPC_FSL_BOOK3E - case PPC_HTW_E6500: - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); - break; -#endif - } - pr_info("MMU: Book3E HW tablewalk %s\n", - book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); -} - -/* - * Early initialization of the MMU TLB code - */ -static void early_init_this_mmu(void) -{ - unsigned int mas4; - - /* Set MAS4 based on page table setting */ - - mas4 = 0x4 << MAS4_WIMGED_SHIFT; - switch (book3e_htw_mode) { - case PPC_HTW_E6500: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; - mas4 |= MAS4_TLBSELD(1); - mmu_pte_psize = MMU_PAGE_2M; - break; - - case PPC_HTW_IBM: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_1M; - break; - - case PPC_HTW_NONE: - mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; - mmu_pte_psize = mmu_virtual_psize; - break; - } - mtspr(SPRN_MAS4, mas4); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned int num_cams; - bool map = true; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - /* - * Only do the mapping once per core, or else the - * transient mapping would cause problems. - */ -#ifdef CONFIG_SMP - if (hweight32(get_tensr()) > 1) - map = false; -#endif - - if (map) - linear_map_top = map_mem_in_cams(linear_map_top, - num_cams, false); - } -#endif - - /* A sync won't hurt us after mucking around with - * the MMU configuration - */ - mb(); -} - -static void __init early_init_mmu_global(void) -{ - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - * - * Freescale booke only supports 4K pages in TLB0, so use that. - */ - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - mmu_vmemmap_psize = MMU_PAGE_4K; - else - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. - */ - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - if (book3e_htw_mode == PPC_HTW_NONE) { - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); - patch_exception(0x1e0, - exc_instruction_tlb_miss_bolted_book3e); - } - } -#endif - - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); - - ioremap_bot = IOREMAP_BASE; -} - -static void __init early_mmu_set_memory_limit(void) -{ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - /* - * Limit memory so we dont have linear faults. - * Unlike memblock_set_current_limit, which limits - * memory available during early boot, this permanently - * reduces the memory available to Linux. We need to - * do this because highmem is not supported on 64-bit. - */ - memblock_enforce_memory_limit(linear_map_top); - } -#endif - - memblock_set_current_limit(linear_map_top); -} - -/* boot cpu only */ -void __init early_init_mmu(void) -{ - early_init_mmu_global(); - early_init_this_mmu(); - early_mmu_set_memory_limit(); -} - -void early_init_mmu_secondary(void) -{ - early_init_this_mmu(); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* On non-FSL Embedded 64-bit, we adjust the RMA size to match - * the bolted TLB entry. We know for now that only 1G - * entries are supported though that may eventually - * change. - * - * on FSL Embedded 64-bit, usually all RAM is bolted, but with - * unusual memory sizes it's possible for some RAM to not be mapped - * (such RAM is not used at all by Linux, since we don't support - * highmem on 64-bit). We limit ppc64_rma_size to what would be - * mappable if this memblock is the only one. Additional memblocks - * can only increase, not decrease, the amount that ends up getting - * mapped. We still limit max to 1G even if we'll eventually map - * more. This is due to what the early init code is set up to do. - * - * We crop it to the size of the first MEMBLOCK to - * avoid going over total available memory just in case... - */ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned long linear_sz; - unsigned int num_cams; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - linear_sz = map_mem_in_cams(first_memblock_size, num_cams, - true); - - ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); - } else -#endif - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(first_memblock_base + ppc64_rma_size); -} -#else /* ! CONFIG_PPC64 */ +#ifndef CONFIG_PPC64 void __init early_init_mmu(void) { -#ifdef CONFIG_PPC_47x - early_init_mmu_47x(); -#endif + unsigned long root = of_get_flat_dt_root(); -#ifdef CONFIG_PPC_MM_SLICES - mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); -#endif + if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) && + of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c new file mode 100644 index 000000000000..4f925adf2695 --- /dev/null +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> + +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/text-patching.h> +#include <asm/cputhreads.h> + +#include <mm/mmu_decl.h> + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +static int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ +int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ +unsigned long linear_map_top; /* Top of linear mapping */ + + +/* + * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug + * exceptions. This is used for bolted and e6500 TLB miss handlers which + * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, + * this is set to zero. + */ +int extlb_level_exc; + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void __init setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int eptcfg; + int psize; + + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. + */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (!def->shift) + continue; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +/* + * Early initialization of the MMU TLB code + */ +static void early_init_this_mmu(void) +{ + unsigned int mas4; + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_NONE: + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + + unsigned int num_cams; + bool map = true; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + /* + * Only do the mapping once per core, or else the + * transient mapping would cause problems. + */ +#ifdef CONFIG_SMP + if (hweight32(get_tensr()) > 1) + map = false; +#endif + + if (map) + linear_map_top = map_mem_in_cams(linear_map_top, + num_cams, false, true); + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +static void __init early_init_mmu_global(void) +{ + /* + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + mmu_vmemmap_psize = MMU_PAGE_4K; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + extlb_level_exc = EX_TLB_SIZE; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; + } + + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); + + ioremap_bot = IOREMAP_BASE; +} + +static void __init early_mmu_set_memory_limit(void) +{ + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. + */ + memblock_enforce_memory_limit(linear_map_top); + + memblock_set_current_limit(linear_map_top); +} + +/* boot cpu only */ +void __init early_init_mmu(void) +{ + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); +} + +void early_init_mmu_secondary(void) +{ + early_init_this_mmu(); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * On FSL Embedded 64-bit, usually all RAM is bolted, but with + * unusual memory sizes it's possible for some RAM to not be mapped + * (such RAM is not used at all by Linux, since we don't support + * highmem on 64-bit). We limit ppc64_rma_size to what would be + * mappable if this memblock is the only one. Additional memblocks + * can only increase, not decrease, the amount that ends up getting + * mapped. We still limit max to 1G even if we'll eventually map + * more. This is due to what the early init code is set up to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ + unsigned long linear_sz; + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 4613bf8e9aae..c4d296e73731 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -32,32 +32,7 @@ #include <asm/asm-compat.h> #include <asm/feature-fixups.h> -#if defined(CONFIG_40x) - -/* - * 40x implementation needs only tlbil_va - */ -_GLOBAL(__tlbil_va) - /* We run the search with interrupts disabled because we have to change - * the PID and I don't want to preempt when that happens. - */ - mfmsr r5 - mfspr r6,SPRN_PID - wrteei 0 - mtspr SPRN_PID,r4 - tlbsx. r3, 0, r3 - mtspr SPRN_PID,r6 - wrtee r5 - bne 1f - sync - /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is - * clear. Since 25 is the V bit in the TLB_TAG, loading this value - * will invalidate the TLB entry. */ - tlbwe r3, r3, TLB_TAG - isync -1: blr - -#elif defined(CONFIG_PPC_8xx) +#if defined(CONFIG_PPC_8xx) /* * Nothing to do for 8xx, everything is inline @@ -186,7 +161,7 @@ _GLOBAL(_tlbivax_bcast) isync PPC_TLBIVAX(0, R3) isync - eieio + mbar tlbsync BEGIN_FTR_SECTION b 1f @@ -199,7 +174,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -221,7 +196,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) blr #endif /* CONFIG_PPC_47x */ -#elif defined(CONFIG_FSL_BOOKE) +#elif defined(CONFIG_PPC_85xx) /* * FSL BookE implementations. * @@ -294,7 +269,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) /* * New Book3E (>= 2.06) implementation * @@ -355,7 +330,7 @@ _GLOBAL(_tlbivax_bcast) rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND 1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ PPC_TLBIVAX(0,R3) - eieio + mbar tlbsync sync wrtee r10 @@ -364,12 +339,12 @@ _GLOBAL(_tlbivax_bcast) #error Unsupported processor type ! #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) /* * extern void loadcam_entry(unsigned int index) * * Load TLBCAM[index] entry in to the L2 CAM MMU - * Must preserve r7, r8, r9, r10 and r11 + * Must preserve r7, r8, r9, r10, r11, r12 */ _GLOBAL(loadcam_entry) mflr r5 @@ -401,7 +376,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) * * r3 = first entry to write * r4 = number of entries to write - * r5 = temporary tlb entry + * r5 = temporary tlb entry (0 means no switch to AS1) */ _GLOBAL(loadcam_multi) mflr r8 @@ -409,12 +384,14 @@ _GLOBAL(loadcam_multi) mfmsr r11 andi. r11,r11,MSR_IS bne 10f + mr. r12, r5 + beq 10f /* * Set up temporary TLB entry that is the same as what we're * running from, but in AS=1. */ - bl 1f + bcl 20,31,$+4 1: mflr r6 tlbsx 0,r8 mfspr r6,SPRN_MAS1 @@ -446,6 +423,8 @@ _GLOBAL(loadcam_multi) /* Don't return to AS=0 if we were in AS=1 at function start */ andi. r11,r11,MSR_IS bne 3f + cmpwi r12, 0 + beq 3f /* Return to AS=0 and clear the temporary entry */ mfmsr r6 diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index bf24451f3e71..de568297d5c5 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -61,7 +61,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) ld r14,PACAPGD(r13) std r15,EX_TLB_R15(r12) std r10,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION mfspr r11, SPRN_SRR1 andi. r10,r11,MSR_PR @@ -70,14 +69,11 @@ START_BTB_FLUSH_SECTION 1: END_BTB_FLUSH_SECTION std r7,EX_TLB_R7(r12) -#endif .endm .macro tlb_epilog_bolted ld r14,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E ld r7,EX_TLB_R7(r12) -#endif ld r10,EX_TLB_R10(r12) ld r11,EX_TLB_R11(r12) ld r13,EX_TLB_R13(r12) @@ -128,6 +124,13 @@ END_BTB_FLUSH_SECTION bne tlb_miss_kernel_bolted +tlb_miss_user_bolted: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- tlb_miss_fault_bolted /* KUAP fault */ +#endif + tlb_miss_common_bolted: /* * This is the guts of the TLB miss handler for bolted-linear. @@ -145,16 +148,7 @@ tlb_miss_common_bolted: clrrdi r15,r15,3 beq tlb_miss_fault_bolted /* No PGDIR, bail */ -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -215,14 +209,15 @@ itlb_miss_kernel_bolted: tlb_miss_kernel_bolted: mfspr r10,SPRN_MAS1 ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + andi. r15,r15,1 /* Check for vmalloc region */ rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted + bne+ tlb_miss_common_bolted tlb_miss_fault_bolted: /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX + andi. r10,r11,_PAGE_BAP_UX|_PAGE_BAP_SX bne itlb_miss_fault_bolted dtlb_miss_fault_bolted: tlb_epilog_bolted @@ -239,17 +234,16 @@ itlb_miss_fault_bolted: srdi r15,r16,60 /* get region */ bne- itlb_miss_fault_bolted - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ /* We do the user/kernel test for the PID here along with the RW test */ cmpldi cr0,r15,0 /* Check for user region */ oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted + beq tlb_miss_user_bolted b itlb_miss_kernel_bolted -#ifdef CONFIG_PPC_FSL_BOOK3E /* * TLB miss handling for e6500 and derivatives, using hardware tablewalk. * @@ -357,7 +351,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) mfspr r15,SPRN_MAS2 isync - tlbilxva 0,r15 + PPC_TLBILX_VA(0,R15) isync mtspr SPRN_MAS6,r10 @@ -456,11 +450,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT) tlb_miss_huge_e6500: beq tlb_miss_fault_e6500 - li r10,1 - andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ - rldimi r14,r10,63,0 /* Set PD_HUGE */ - xor r14,r14,r15 /* Clear size bits */ - ldx r14,0,r14 + rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e /* * Now we build the MAS for a huge page. @@ -471,7 +461,6 @@ tlb_miss_huge_e6500: * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler */ - subi r15,r15,10 /* Convert psize to tsize */ mfspr r10,SPRN_MAS1 rlwinm r10,r10,0,~MAS1_IND rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK @@ -500,7 +489,9 @@ tlb_miss_huge_e6500: tlb_miss_kernel_e6500: ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ + srdi r15,r16,44 /* get kernel region */ + xoris r15,r15,0xc /* Check for vmalloc region */ + cmplwi cr1,r15,1 beq+ cr1,tlb_miss_common_e6500 tlb_miss_fault_e6500: @@ -514,237 +505,6 @@ dtlb_miss_fault_e6500: itlb_miss_fault_e6500: tlb_epilog_bolted b exc_instruction_storage_book3e -#endif /* CONFIG_PPC_FSL_BOOK3E */ - -/********************************************************************** - * * - * TLB miss handling for Book3E with TLB reservation and HES support * - * * - **********************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - - /* The page tables are mapped virtually linear. At this point, though, - * we don't know whether we are trying to fault in a first level - * virtual address or a virtual page table address. We can get that - * from bit 0x1 of the region ID which we have set for a page table - */ - andi. r10,r15,0x1 - bne- virt_page_table_tlb_miss - - std r14,EX_TLB_ESR(r12); /* save ESR */ - std r16,EX_TLB_DEAR(r12); /* save DEAR */ - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - li r11,_PAGE_PRESENT - oris r11,r11,_PAGE_ACCESSED@h - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r15,0 /* Check for user region */ - - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. - * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - rlwimi r11,r14,32-19,27,27 - rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by writing a crazy value in ESR in our exception frame - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - - cmpldi cr0,r15,0 /* Check for user region */ - std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss - - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of the first-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = faulting address - * r15 = region ID - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ -normal_tlb_miss: - /* So we first construct the page table address. We do that by - * shifting the bottom of the address (not the region ID) by - * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and - * or'ing the fourth high bit. - * - * NOTE: For 64K pages, we do things slightly differently in - * order to handle the weird page table format used by linux - */ - ori r10,r15,0x1 - rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 - sldi r15,r10,60 - clrrdi r14,r14,3 - or r10,r15,r14 - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE - ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -finish_normal_tlb_miss: - /* Check if required permissions are met */ - andc. r15,r11,r14 - bne- normal_tlb_miss_access_fault - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - * - * TODO: mix up code below for better scheduling - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 - - /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 - cmpldi cr0,r11,BOOK3E_PAGESZ_4K - beq- 1f - mfspr r11,SPRN_MAS1 - rlwimi r11,r14,31,21,24 - rlwinm r11,r11,0,21,19 - mtspr SPRN_MAS1,r11 -1: - /* Move RPN in position */ - rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - clrldi r15,r11,12 /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - andi. r11,r14,_PAGE_DIRTY - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: -BEGIN_MMU_FTR_SECTION - srdi r16,r15,32 - mtspr SPRN_MAS3,r15 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -normal_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_EPILOG_SUCCESS - rfi - -normal_tlb_miss_access_fault: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC - bne 1f - ld r14,EX_TLB_DEAR(r12) - ld r15,EX_TLB_ESR(r12) - mtspr SPRN_DEAR,r14 - mtspr SPRN_ESR,r15 - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - /* * This is the guts of the second-level TLB miss handler for direct @@ -772,6 +532,7 @@ normal_tlb_miss_access_fault: */ virt_page_table_tlb_miss: /* Are we hitting a kernel page table ? */ + srdi r15,r16,60 andi. r10,r15,0x8 /* The cool thing now is that r10 contains 0 for user and 8 for kernel, @@ -786,19 +547,22 @@ virt_page_table_tlb_miss: mfspr r10,SPRN_MAS1 rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 +#ifdef CONFIG_PPC_KUAP + b 2f 1: -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- virt_page_table_tlb_miss_fault /* KUAP fault */ +2: +#else +1: +#endif /* Now, we need to walk the page tables. First check if we are in * range. */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + cmpldi r10,0x80 bne- virt_page_table_tlb_miss_fault /* Get the PGD pointer */ @@ -844,41 +608,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_EPILOG_SUCCESS rfi @@ -927,198 +662,6 @@ virt_page_table_tlb_miss_whacko_fault: TLB_MISS_EPILOG_ERROR b exc_data_storage_book3e - -/************************************************************** - * * - * TLB miss handling for Book3E with hw page table support * - * * - **************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_htw) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_htw) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by keeping a crazy value for ESR in r14 - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = virtual page table faulting address - * r15 = PGD pointer - * r14 = ESR - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will save/restore things for us - */ -htw_tlb_miss: - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - * - * MAS1:IND should be already set based on MAS4 - */ - PPC_TLBSRX_DOT(0,R16) - beq htw_tlb_miss_done - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- htw_tlb_miss_fault - - /* Get the PGD pointer */ - cmpldi cr0,r15,0 - beq- htw_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Get to PUD entry */ - rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Get to PMD entry */ - rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Ok, we're all right, we can now create an indirect entry for - * a 1M or 256M page. - * - * The last trick is now that because we use "half" pages for - * the HTW (1M IND is 2K and 256M IND is 32K) we need to account - * for an added LSB bit to the RPN. For 64K pages, there is no - * problem as we already use 32K arrays (half PTE pages), but for - * 4K page we need to extract a bit from the virtual address and - * insert it into the "PA52" bit of the RPN. - */ - rlwimi r15,r16,32-9,20,20 - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - */ - ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -htw_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_EPILOG_SUCCESS - rfi - -htw_tlb_miss_fault: - /* We need to check if it was an instruction miss. We know this - * though because r14 would contain -1 - */ - cmpdi cr0,r14,-1 - beq 1f - mtspr SPRN_DEAR,r16 - mtspr SPRN_ESR,r14 - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - /* * This is the guts of "any" level TLB miss handler for kernel linear * mapping misses. We are entered with: @@ -1149,8 +692,8 @@ tlb_load_linear: * we only use 1G pages for now. That might have to be changed in a * final implementation, especially when dealing with hypervisors */ - ld r11,PACATOC(r13) - ld r11,linear_map_top@got(r11) + __LOAD_PACA_TOC(r11) + LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top) ld r10,0(r11) tovirt(10,10) cmpld cr0,r16,r10 @@ -1175,13 +718,9 @@ tlb_load_linear: clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX -BEGIN_MMU_FTR_SECTION srdi r16,r10,32 mtspr SPRN_MAS3,r10 mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..603a0f652ba6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/of.h> +#include <linux/of_address.h> #include <linux/pfn.h> #include <linux/cpuset.h> #include <linux/node.h> @@ -26,7 +27,6 @@ #include <linux/slab.h> #include <asm/cputhreads.h> #include <asm/sparsemem.h> -#include <asm/prom.h> #include <asm/smp.h> #include <asm/topology.h> #include <asm/firmware.h> @@ -34,31 +34,35 @@ #include <asm/hvcall.h> #include <asm/setup.h> #include <asm/vdso.h> +#include <asm/vphn.h> #include <asm/drmem.h> static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); -EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -79,7 +83,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %u nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -123,13 +127,13 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; } -static void reset_numa_cpu_lookup_table(void) +static void __init reset_numa_cpu_lookup_table(void) { unsigned int cpu; @@ -137,33 +141,79 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) +{ + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. + */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -179,6 +229,15 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { @@ -190,7 +249,9 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -205,52 +266,6 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) -{ - int i; - - if (!form1_affinity) - return; - - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; - - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } -} - -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) -{ - int nid = NUMA_NO_NODE; - - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } - -out: - return nid; -} - /* Returns the nid associated with the given device tree node, * or -1 if not found. */ @@ -284,10 +299,155 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) +{ + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + int nid; + + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} +EXPORT_SYMBOL_GPL(update_numa_distance); + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void __init initialize_form2_numa_distance_lookup_table(void) { - int depth; + int i, j; struct device_node *root; + const __u8 *form2_distances; + const __be32 *numa_lookup_index; + int form2_distances_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL); + form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1); + /* Skip the size which is encoded int */ + form2_distances += sizeof(__be32); + + pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n", + form2_distances_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (form2_distances_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + form2_distances = NULL; // don't use it + } + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + int dist; + + if (form2_distances) + dist = form2_distances[distance_index++]; + else if (nodeA == nodeB) + dist = LOCAL_DISTANCE; + else + dist = REMOTE_DISTANCE; + numa_distance_table[nodeA][nodeB] = dist; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist); + } + } + + of_node_put(root); +} + +static int __init find_primary_domain_index(void) +{ + int index; + struct device_node *root; + + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + pr_debug("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + pr_debug("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); @@ -313,42 +473,37 @@ static int __init find_min_common_depth(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); + } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ + index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. */ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -426,6 +581,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } +static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -437,35 +624,28 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; - nid = of_read_number(&aa.arrays[index], 1); - - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; + const __be32 *associativity; - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -485,12 +665,30 @@ static int vphn_get_nid(long lcpu) rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -598,9 +796,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } @@ -685,7 +880,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -699,27 +894,34 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, static int __init parse_numa_properties(void) { - struct device_node *memory; + struct device_node *memory, *pci; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); + + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); /* * Even though we connect cpus to numa domains later in SMP @@ -727,22 +929,36 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } - node_set_online(nid); + /* node_set_online() is an UB if 'nid' is negative */ + if (likely(nid >= 0)) + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); @@ -774,8 +990,11 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); @@ -789,6 +1008,18 @@ new_range: goto new_range; } + for_each_node_by_name(pci, "pci") { + int nid = NUMA_NO_NODE; + + associativity = of_get_associativity(pci); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } + if (likely(nid >= 0) && !node_online(nid)) + node_set_online(nid); + } + /* * Now do the same thing for each MEMBLOCK listed in the * ibm,dynamic-memory property in the @@ -811,10 +1042,8 @@ static void __init setup_nonnuma(void) unsigned int nid = 0; int i; - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); @@ -864,27 +1093,9 @@ void __init dump_numa_cpu_topology(void) static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { u64 spanned_pages = end_pfn - start_pfn; - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); - u64 nd_pa; - void *nd; - int tnid; - - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, nid); - - nd = __va(nd_pa); - - /* report and initialize */ - pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + + alloc_node_data(nid); + NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = spanned_pages; @@ -892,8 +1103,8 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { - struct device_node *rtas; - const __be32 *domains; + struct device_node *rtas, *root; + const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -909,9 +1120,16 @@ static void __init find_possible_nodes(void) * it doesn't exist, then fallback on ibm,max-associativity-domains. * Current denotes what the platform can support compared to max * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. */ - domains = of_get_property(rtas, "ibm,current-associativity-domains", - &prop_length); + root = of_find_node_by_path("/"); + if (!of_get_property(root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); + of_node_put(root); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -919,14 +1137,16 @@ static void __init find_possible_nodes(void) goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -937,6 +1157,9 @@ void __init mem_topology_setup(void) { int cpu; + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; + /* * Linux/mm assumes node 0 to be online at boot. However this is not * true on PowerPC, where node 0 is similar to any other node, it @@ -981,9 +1204,6 @@ void __init initmem_init(void) { int nid; - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; - memblock_dump_all(); for_each_online_node(nid) { @@ -1014,9 +1234,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1068,23 +1285,15 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr) int nid = NUMA_NO_NODE; for_each_node_by_type(memory, "memory") { - unsigned long start, size; - int ranges; - const __be32 *memcell_buf; - unsigned int len; - - memcell_buf = of_get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + int i = 0; - /* ranges in cell */ - ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + while (1) { + struct resource res; - while (ranges--) { - start = read_n_cells(n_mem_addr_cells, &memcell_buf); - size = read_n_cells(n_mem_size_cells, &memcell_buf); + if (of_address_to_resource(memory, i++, &res)) + break; - if ((scn_addr < start) || (scn_addr >= (start + size))) + if ((scn_addr < res.start) || (scn_addr > res.end)) continue; nid = of_node_to_nid_single(memory); @@ -1127,7 +1336,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) return nid; } -static u64 hot_add_drconf_memory_max(void) +u64 hot_add_drconf_memory_max(void) { struct device_node *memory = NULL; struct device_node *dn = NULL; @@ -1179,7 +1388,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_SUCCESS: - dbg("VPHN hcall succeeded. Reset polling...\n"); + pr_debug("VPHN hcall succeeded. Reset polling...\n"); goto out; case H_FUNCTION: @@ -1202,43 +1411,26 @@ out: return rc; } -int find_and_online_cpu_nid(int cpu) +void find_and_update_cpu_nid(int cpu) { __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; int new_nid; /* Use associativity from first thread for all siblings */ if (vphn_get_associativity(cpu, associativity)) - return cpu_to_node(cpu); + return; + /* Do not have previous associativity, so find it now. */ new_nid = associativity_to_nid(associativity); - if (new_nid < 0 || !node_possible(new_nid)) - new_nid = first_online_node; - if (NODE_DATA(new_nid) == NULL) { -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Need to ensure that NODE_DATA is initialized for a node from - * available memory (see memblock_alloc_try_nid). If unable to - * init the node, then default to nearest node that has memory - * installed. Skip onlining a node if the subsystems are not - * yet initialized. - */ - if (!topology_inited || try_online_node(new_nid)) - new_nid = first_online_node; -#else - /* - * Default to using the nearest node that has memory installed. - * Otherwise, it would be necessary to patch the kernel MM code - * to deal with more memoryless-node error conditions. - */ + if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; -#endif - } + else + // Associate node <-> cpu, so cpu_up() calls + // try_online_node() on the right node. + set_cpu_numa_node(cpu, new_nid); - pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__, - cpu, new_nid); - return new_nid; + pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); } int cpu_to_coregroup_id(int cpu) @@ -1259,7 +1451,7 @@ int cpu_to_coregroup_id(int cpu) goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c index 0876216ceee6..ac22bf28086f 100644 --- a/arch/powerpc/mm/pageattr.c +++ b/arch/powerpc/mm/pageattr.c @@ -14,57 +14,60 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <mm/mmu_decl.h> + +static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr, + unsigned long old, unsigned long new) +{ + return pte_update(&init_mm, addr, ptep, old & ~new, new & ~old, 0); +} /* - * Updates the attributes of a page in three steps: - * - * 1. invalidate the page table entry - * 2. flush the TLB - * 3. install the new entry with the updated attributes - * - * Invalidating the pte means there are situations where this will not work - * when in theory it should. - * For example: - * - removing write from page whilst it is being executed - * - setting a page read-only whilst it is being read by another CPU + * Updates the attributes of a page atomically. * + * This sequence is safe against concurrent updates, and also allows updating the + * attributes of a page currently being executed or accessed. */ static int change_page_attr(pte_t *ptep, unsigned long addr, void *data) { long action = (long)data; - pte_t pte; - - spin_lock(&init_mm.page_table_lock); - - /* invalidate the PTE so it's safe to modify */ - pte = ptep_get_and_clear(&init_mm, addr, ptep); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - /* modify the PTE bits as desired, then apply */ + addr &= PAGE_MASK; + /* modify the PTE bits as desired */ switch (action) { case SET_MEMORY_RO: - pte = pte_wrprotect(pte); + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO); + break; + case SET_MEMORY_ROX: + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_ROX); break; case SET_MEMORY_RW: - pte = pte_mkwrite(pte_mkdirty(pte)); + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW); break; case SET_MEMORY_NX: - pte = pte_exprotect(pte); + pte_update_delta(ptep, addr, _PAGE_KERNEL_ROX, _PAGE_KERNEL_RO); break; case SET_MEMORY_X: - pte = pte_mkexec(pte); + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX); + break; + case SET_MEMORY_NP: + pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0); + break; + case SET_MEMORY_P: + pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0); break; default: WARN_ON_ONCE(1); break; } - set_pte_at(&init_mm, addr, ptep, pte); - /* See ptesync comment in radix__set_pte_at() */ if (radix_enabled()) asm volatile("ptesync": : :"memory"); - spin_unlock(&init_mm.page_table_lock); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); return 0; } @@ -100,35 +103,25 @@ int change_memory_attr(unsigned long addr, int numpages, long action) change_page_attr, (void *)action); } -/* - * Set the attributes of a page: - * - * This function is used by PPC32 at the end of init to set final kernel memory - * protection. It includes changing the maping of the page it is executing from - * and data pages it is using. - */ -static int set_page_attr(pte_t *ptep, unsigned long addr, void *data) +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +#ifdef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) { - pgprot_t prot = __pgprot((unsigned long)data); - - spin_lock(&init_mm.page_table_lock); - - set_pte_at(&init_mm, addr, ptep, pte_modify(*ptep, prot)); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - spin_unlock(&init_mm.page_table_lock); - - return 0; -} + int err; + unsigned long addr = (unsigned long)page_address(page); -int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot) -{ - unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); - unsigned long sz = numpages * PAGE_SIZE; + if (PageHighMem(page)) + return; - if (numpages <= 0) - return 0; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) + err = hash__kernel_map_pages(page, numpages, enable); + else if (enable) + err = set_memory_p(addr, numpages); + else + err = set_memory_np(addr, numpages); - return apply_to_existing_page_range(&init_mm, start, sz, set_page_attr, - (void *)pgprot_val(prot)); + if (err) + panic("%s: changing memory protections failed\n", __func__); } +#endif +#endif diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 97ae4935da79..77e55eac16e4 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -18,15 +18,15 @@ void pte_frag_destroy(void *pte_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pte_frag); + ptdesc = virt_to_ptdesc(pte_frag); /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pte_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -55,25 +55,23 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page; - - if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) - return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); - return NULL; - } - } else { - page = alloc_page(PGALLOC_GFP); - if (!page) - return NULL; + struct ptdesc *ptdesc; + gfp_t gfp = PGALLOC_GFP; + + if (!kernel) + gfp |= __GFP_ACCOUNT; + + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -82,12 +80,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return ret; spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return - * the allocated page with single fragement + * If we find ptdesc_page set, we return + * the allocated page with single fragment * count. */ if (likely(!pte_frag_get(&mm->context))) { - atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); } spin_unlock(&mm->page_table_lock); @@ -106,17 +104,38 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) return __alloc_for_ptecache(mm, kernel); } +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + void pte_fragment_free(unsigned long *table, int kernel) { - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - if (!kernel) - pgtable_pte_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc))) + pte_free_now(&ptdesc->pt_rcu_head); + else + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); } } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct folio *folio; + + folio = virt_to_folio(pgtable); + folio_set_active(folio); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cd16b407f47e..56d7e8960e77 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -46,19 +46,19 @@ static inline int is_exec_fault(void) * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ -static inline int pte_looks_normal(pte_t pte) +static inline int pte_looks_normal(pte_t pte, unsigned long addr) { if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; - if (pte_user(pte)) + if (!is_kernel_addr(addr)) return 1; } return 0; } -static struct page *maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } #ifdef CONFIG_PPC_BOOK3S @@ -79,20 +79,17 @@ static struct page *maybe_pte_to_page(pte_t pte) * support falls into the same category. */ -static pte_t set_pte_filter_hash(pte_t pte) +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { - if (radix_enabled()) - return pte; - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &pg->flags)) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); } } return pte; @@ -100,38 +97,43 @@ static pte_t set_pte_filter_hash(pte_t pte) #else /* CONFIG_PPC_BOOK3S */ -static pte_t set_pte_filter_hash(pte_t pte) { return pte; } +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; } #endif /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so * instead we "filter out" the exec permission for non clean pages. + * + * This is also called once for the folio. So only work with folio->flags here. */ -static inline pte_t set_pte_filter(pte_t pte) +static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) { - struct page *pg; + struct folio *folio; + + if (radix_enabled()) + return pte; if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return set_pte_filter_hash(pte); + return set_pte_filter_hash(pte, addr); /* No exec permission in the first place, move on */ - if (!pte_exec(pte) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte, addr)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); return pte; } @@ -142,7 +144,10 @@ static inline pte_t set_pte_filter(pte_t pte) static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return pte; if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) return pte; @@ -165,17 +170,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) goto bail; /* Clean the page and set PG_dcache_clean */ - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); bail: return pte_mkexec(pte); @@ -184,23 +189,48 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { - /* - * Make sure hardware valid bit is not set. We don't do - * tlb flush for this update. - */ - VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this - * is called. + * is called. Filter the pte value and use the filtered value + * to setup all the ptes in the range. + */ + pte = set_pte_filter(pte, addr); + + /* + * We don't need to call arch_enter/leave_lazy_mmu_mode() + * because we expect set_ptes to be only be used on not present + * and not hw_valid ptes. Hence there is no translation cache flush + * involved that need to be batched. */ - pte = set_pte_filter(pte); + for (;;) { - /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte = pte_next_pfn(pte); + } +} + +void unmap_kernel_page(unsigned long va) +{ + pmd_t *pmdp = pmd_off_k(va); + pte_t *ptep = pte_offset_kernel(pmdp, va); + + pte_clear(&init_mm, va, ptep); + flush_tlb_kernel_range(va, va + PAGE_SIZE); } /* @@ -267,11 +297,15 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS) +/* We need the same lock to protect the PMD table and the two PTE tables. */ +#error "8M hugetlb folios are incompatible with split page table locks" +#endif + +static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val) { - pmd_t *pmd = pmd_off(mm, addr); - pte_basic_t val; - pte_basic_t *entry = &ptep->pte; + pte_basic_t *entry = (pte_basic_t *)ptep; int num, i; /* @@ -280,15 +314,60 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = set_pte_filter(pte); - - val = pte_val(pte); - num = number_of_cells_per_pte(pmd, val, 1); for (i = 0; i < num; i++, entry++, val += SZ_4K) *entry = val; } + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + pmd_t *pmdp = pmd_off(mm, addr); + + pte = set_pte_filter(pte, addr); + + if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */ + *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M); + *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M); + + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte)); + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M); + } else { + __set_huge_pte_at(pmdp, ptep, pte_val(pte)); + } +} +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + unsigned long pdsize; + int i; + + pte = set_pte_filter(pte, addr); + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + if (sz < PMD_SIZE) + pdsize = PAGE_SIZE; + else if (sz < PUD_SIZE) + pdsize = PMD_SIZE; + else if (sz < P4D_SIZE) + pdsize = PUD_SIZE; + else if (sz < PGDIR_SIZE) + pdsize = P4D_SIZE; + else + pdsize = PGDIR_SIZE; + + for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) { + __set_pte_at(mm, addr, ptep, pte, 0); + pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT)); + } +} #endif #endif /* CONFIG_HUGETLB_PAGE */ @@ -299,6 +378,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; @@ -317,8 +398,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ @@ -332,14 +415,13 @@ unsigned long vmalloc_to_phys(void *va) EXPORT_SYMBOL_GPL(vmalloc_to_phys); /* - * We have 4 cases for pgds and pmds: + * We have 3 cases for pgds and pmds: * (1) invalid (all zeroes) * (2) pointer to next table, as normal; bottom 6 bits == 0 * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table * * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. + * we can follow the address down to the page and take a ref on it. * This function need to be called with interrupts disabled. We use this variant * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED */ @@ -347,11 +429,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *hpage_shift) { pgd_t *pgdp; +#ifdef CONFIG_PPC64 p4d_t p4d, *p4dp; pud_t pud, *pudp; +#endif pmd_t pmd, *pmdp; pte_t *ret_pte; - hugepd_t *hpdp = NULL; unsigned pdshift; if (hpage_shift) @@ -366,8 +449,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, * page fault or a page unmap. The return pte_t * is still not * stable. So should be checked there for above conditions. * Top level is an exception because it is folded into p4d. + * + * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to + * PMD level. */ pgdp = pgdir + pgd_index(ea); +#ifdef CONFIG_PPC64 p4dp = p4d_offset(pgdp, ea); p4d = READ_ONCE(*p4dp); pdshift = P4D_SHIFT; @@ -375,16 +462,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (p4d_none(p4d)) return NULL; - if (p4d_is_leaf(p4d)) { + if (p4d_leaf(p4d)) { ret_pte = (pte_t *)p4dp; goto out; } - if (is_hugepd(__hugepd(p4d_val(p4d)))) { - hpdp = (hugepd_t *)&p4d; - goto out_huge; - } - /* * Even if we end up with an unmap, the pgtable will not * be freed, because we do an rcu free and here we are @@ -397,18 +479,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pud_none(pud)) return NULL; - if (pud_is_leaf(pud)) { + if (pud_leaf(pud)) { ret_pte = (pte_t *)pudp; goto out; } - if (is_hugepd(__hugepd(pud_val(pud)))) { - hpdp = (hugepd_t *)&pud; - goto out_huge; - } - - pdshift = PMD_SHIFT; pmdp = pmd_offset(&pud, ea); +#else + pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea); +#endif + pdshift = PMD_SHIFT; pmd = READ_ONCE(*pmdp); /* @@ -429,34 +509,47 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, return NULL; #endif - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (pmd_trans_huge(pmd)) { if (is_thp) *is_thp = true; ret_pte = (pte_t *)pmdp; goto out; } - if (pmd_is_leaf(pmd)) { + if (pmd_leaf(pmd)) { ret_pte = (pte_t *)pmdp; goto out; } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } - return pte_offset_kernel(&pmd, ea); -out_huge: - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); out: if (hpage_shift) *hpage_shift = pdshift; return ret_pte; } EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXECONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index dcf5ecca19d9..0c9ef705803e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -33,8 +33,6 @@ #include <mm/mmu_decl.h> -extern char etext[], _stext[], _sinittext[], _einittext[]; - static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data; notrace void __init early_ioremap_init(void) @@ -50,15 +48,10 @@ notrace void __init early_ioremap_init(void) early_ioremap_setup(); } -static void __init *early_alloc_pgtable(unsigned long size) +void __init *early_alloc_pgtable(unsigned long size) { - void *ptr = memblock_alloc(size, size); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, size); + return memblock_alloc_or_panic(size, size); - return ptr; } pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) @@ -104,15 +97,14 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { unsigned long v, s; phys_addr_t p; - int ktext; + bool ktext; s = offset; v = PAGE_OFFSET + s; p = memstart_addr + s; for (; s < top; s += PAGE_SIZE) { - ktext = ((char *)v >= _stext && (char *)v < etext) || - ((char *)v >= _sinittext && (char *)v < _einittext); - map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); + ktext = core_kernel_text(v); + map_kernel_page(v, p, ktext ? PAGE_KERNEL_X : PAGE_KERNEL); v += PAGE_SIZE; p += PAGE_SIZE; } @@ -133,57 +125,58 @@ void __init mapin_ram(void) } } -void mark_initmem_nx(void) +static int __mark_initmem_nx(void) { unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); + int err; - if (v_block_mapped((unsigned long)_sinittext)) - mmu_mark_initmem_nx(); - else - set_memory_attr((unsigned long)_sinittext, numpages, PAGE_KERNEL); + err = mmu_mark_initmem_nx(); + + if (!v_block_mapped((unsigned long)_sinittext)) { + err = set_memory_nx((unsigned long)_sinittext, numpages); + if (err) + return err; + err = set_memory_rw((unsigned long)_sinittext, numpages); + } + return err; +} + +void mark_initmem_nx(void) +{ + int err = __mark_initmem_nx(); + + if (err) + panic("%s() failed, err = %d\n", __func__, err); } #ifdef CONFIG_STRICT_KERNEL_RWX -void mark_rodata_ro(void) +static int __mark_rodata_ro(void) { unsigned long numpages; - if (v_block_mapped((unsigned long)_stext + 1)) { - mmu_mark_rodata_ro(); - ptdump_check_wx(); - return; - } + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE)) + pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n"); - numpages = PFN_UP((unsigned long)_etext) - - PFN_DOWN((unsigned long)_stext); + if (v_block_mapped((unsigned long)_stext + 1)) + return mmu_mark_rodata_ro(); - set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX); /* - * mark .rodata as read only. Use __init_begin rather than __end_rodata - * to cover NOTES and EXCEPTION_TABLE. + * mark text and rodata as read only. __end_rodata is set by + * powerpc's linker script and includes tables and data + * requiring relocation which are not put in RO_DATA. */ - numpages = PFN_UP((unsigned long)__init_begin) - - PFN_DOWN((unsigned long)__start_rodata); - - set_memory_attr((unsigned long)__start_rodata, numpages, PAGE_KERNEL_RO); + numpages = PFN_UP((unsigned long)__end_rodata) - + PFN_DOWN((unsigned long)_stext); - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); + return set_memory_ro((unsigned long)_stext, numpages); } -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -void __kernel_map_pages(struct page *page, int numpages, int enable) +void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)page_address(page); - - if (PageHighMem(page)) - return; + int err = __mark_rodata_ro(); - if (enable) - set_memory_attr(addr, numpages, PAGE_KERNEL); - else - set_memory_attr(addr, numpages, __pgprot(0)); + if (err) + panic("%s() failed, err = %d\n", __func__, err); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 78c8cf01db5f..6621cfc3baf8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -32,7 +32,6 @@ #include <linux/hugetlb.h> #include <asm/page.h> -#include <asm/prom.h> #include <asm/mmu_context.h> #include <asm/mmu.h> #include <asm/smp.h> @@ -101,8 +100,9 @@ EXPORT_SYMBOL(__pte_frag_size_shift); /* 4 level page table */ struct page *p4d_page(p4d_t p4d) { - if (p4d_is_leaf(p4d)) { - VM_WARN_ON(!p4d_huge(p4d)); + if (p4d_leaf(p4d)) { + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!p4d_leaf(p4d)); return pte_page(p4d_pte(p4d)); } return virt_to_page(p4d_pgtable(p4d)); @@ -111,8 +111,9 @@ struct page *p4d_page(p4d_t p4d) struct page *pud_page(pud_t pud) { - if (pud_is_leaf(pud)) { - VM_WARN_ON(!pud_huge(pud)); + if (pud_leaf(pud)) { + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!pud_leaf(pud)); return pte_page(pud_pte(pud)); } return virt_to_page(pud_pgtable(pud)); @@ -124,8 +125,14 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_is_leaf(pmd)) { - VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); + if (pmd_leaf(pmd)) { + /* + * vmalloc_to_page may be called on any vmap address (not only + * vmalloc), and it uses pmd_page() etc., when huge vmap is + * enabled so these checks can't be used. + */ + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!pmd_leaf(pmd)); return pte_page(pmd_pte(pmd)); } return virt_to_page(pmd_page_vaddr(pmd)); @@ -143,9 +150,6 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 86da2a669680..ff845f251724 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -21,11 +21,6 @@ static const struct flag_info flag_array[] = { .set = "huge", .clear = " ", }, { - .mask = _PAGE_SH, - .val = 0, - .set = "user", - .clear = " ", - }, { .mask = _PAGE_RO | _PAGE_NA, .val = 0, .set = "rw", @@ -74,18 +69,25 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 712762be3cb1..0f7a050f327e 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -2,8 +2,13 @@ obj-y += ptdump.o -obj-$(CONFIG_4xx) += shared.o +obj-$(CONFIG_44x) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o -obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o +obj-$(CONFIG_PPC_E500) += shared.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index c4c628b03cf8..820c119013e4 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -7,7 +7,7 @@ */ #include <linux/pgtable.h> -#include <asm/debugfs.h> +#include <linux/debugfs.h> #include <asm/cpu_has_feature.h> #include "ptdump.h" @@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) -static int bats_show_603(struct seq_file *m, void *v) +static int bats_show(struct seq_file *m, void *v) { seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); @@ -88,22 +88,12 @@ static int bats_show_603(struct seq_file *m, void *v) return 0; } -static int bats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bats_show_603, NULL); -} - -static const struct file_operations bats_fops = { - .open = bats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bats); static int __init bats_init(void) { debugfs_create_file("block_address_translation", 0400, - powerpc_debugfs_root, NULL, &bats_fops); + arch_debugfs_dir, NULL, &bats_fops); return 0; } device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 14f73868db66..e8a21c6dc32e 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -102,18 +102,25 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index ad6df9a2e7c8..671d0dc00c6d 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -216,6 +216,8 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 vpn = hpt_vpn(ea, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + want_v = hpte_old_to_new_v(want_v); /* to check in the secondary hash table, we invert the hash */ if (!primary) @@ -229,6 +231,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 /* HPTE matches */ *v = be64_to_cpu(hptep->v); *r = be64_to_cpu(hptep->r); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + *v = hpte_new_to_old_v(*v, *r); + *r = hpte_new_to_old_r(*r); + } return 0; } ++hpte_group; @@ -238,7 +244,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { - struct hash_pte ptes[4]; + struct { + unsigned long v; + unsigned long r; + } ptes[4]; unsigned long vsid, vpn, hash, hpte_group, want_v; int i, j, ssize = mmu_kernel_ssize; long lpar_rc = 0; @@ -488,7 +497,7 @@ static void walk_vmemmap(struct pg_state *st) * Traverse the vmemmaped memory and dump pages that are in the hash * pagetable. */ - while (ptr->list) { + while (ptr) { hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize); ptr = ptr->list; } @@ -526,17 +535,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5062c58b1e5b..0d499aebee72 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -16,10 +16,12 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/ptdump.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <asm/fixmap.h> #include <linux/const.h> +#include <linux/kasan.h> #include <asm/page.h> #include <asm/hugetlb.h> @@ -54,11 +56,12 @@ * */ struct pg_state { + struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; unsigned long start_pa; - unsigned int level; + int level; u64 current_flags; bool check_wx; unsigned long wx_pages; @@ -102,6 +105,11 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -116,7 +124,7 @@ static struct addr_marker address_markers[] = { void pt_dump_size(struct seq_file *m, unsigned long size) { - static const char units[] = "KMGTPE"; + static const char units[] = " KMGTPE"; const char *unit = units; /* Work out what appropriate unit to use */ @@ -169,29 +177,30 @@ static void dump_addr(struct pg_state *st, unsigned long addr) pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); - pt_dump_size(st->seq, (addr - st->start_address) >> 10); + pt_dump_size(st->seq, addr - st->start_address); + pt_dump_seq_printf(st->seq, "%s ", pg_level[st->level].name); } static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) + if (!st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) return; - WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; u64 pa = val & PTE_RPN_MASK; st->level = level; @@ -205,15 +214,15 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, } } -static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); /* At first no level is set */ - if (!st->level) { + if (st->level == -1) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -242,95 +251,7 @@ static void note_page(struct pg_state *st, unsigned long addr, * Address indicates we have passed the end of the * current section of virtual memory */ - note_page_update_state(st, addr, level, val, page_size); - } -} - -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE); - - } -} - -static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start, - int pdshift, int level) -{ -#ifdef CONFIG_ARCH_HAS_HUGEPD - unsigned int i; - int shift = hugepd_shift(*phpd); - int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1; - - if (start & ((1 << shift) - 1)) - return; - - for (i = 0; i < ptrs_per_hpd; i++) { - unsigned long addr = start + (i << shift); - pte_t *pte = hugepte_offset(*phpd, addr, pdshift); - - note_page(st, addr, level + 1, pte_val(*pte), 1 << shift); - } -#endif -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE); - } -} - -static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) -{ - pud_t *pud = pud_offset(p4d, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_is_leaf(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud), PUD_SIZE); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - unsigned int i; - unsigned long addr = st->start_address & PGDIR_MASK; - pgd_t *pgd = pgd_offset_k(addr); - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - p4d_t *p4d = p4d_offset(pgd, 0); - - if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) - note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); - else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); - else - /* p4d exists */ - walk_pud(st, p4d, addr); + note_page_update_state(st, addr, level, val); } } @@ -371,11 +292,43 @@ static void populate_markers(void) #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ #ifdef CONFIG_KASAN address_markers[i++].start_address = KASAN_SHADOW_START; address_markers[i++].start_address = KASAN_SHADOW_END; #endif -#endif /* CONFIG_PPC64 */ +} + +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); } static int ptdump_show(struct seq_file *m, void *v) @@ -383,34 +336,26 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .level = -1, + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, 0, 0, 0); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void build_pgtable_complete_mask(void) +static void __init build_pgtable_complete_mask(void) { unsigned int i, j; @@ -420,37 +365,61 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_PPC_DEBUG_WX -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, - .marker = address_markers, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .level = -1, .check_wx = true, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return true; - walk_pagetables(&st); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } -#endif -static int ptdump_init(void) +static int __init ptdump_init(void) { +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); +#endif + populate_markers(); build_pgtable_complete_mask(); - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; } device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h index 154efae96ae0..12aa9eca8b0c 100644 --- a/arch/powerpc/mm/ptdump/ptdump.h +++ b/arch/powerpc/mm/ptdump/ptdump.h @@ -11,12 +11,13 @@ struct flag_info { int shift; }; -struct pgtable_level { +struct ptdump_pg_level { const struct flag_info *flag; + char name[4]; size_t num; u64 mask; }; -extern struct pgtable_level pg_level[5]; +extern struct ptdump_pg_level pg_level[5]; void pt_dump_size(struct seq_file *m, unsigned long delta); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 565048a0c9be..9df3af8d481f 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -6,7 +6,7 @@ * This dumps the content of Segment Registers */ -#include <asm/debugfs.h> +#include <linux/debugfs.h> static void seg_show(struct seq_file *m, int i) { @@ -41,21 +41,11 @@ static int sr_show(struct seq_file *m, void *v) return 0; } -static int sr_open(struct inode *inode, struct file *file) -{ - return single_open(file, sr_show, NULL); -} - -static const struct file_operations sr_fops = { - .open = sr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sr); static int __init sr_init(void) { - debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root, + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, NULL, &sr_fops); return 0; } diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index c005fe041c18..edc69da19b85 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -11,15 +11,15 @@ static const struct flag_info flag_array[] = { { - .mask = _PAGE_USER, - .val = _PAGE_USER, - .set = "user", - .clear = " ", + .mask = _PAGE_READ, + .val = 0, + .set = " ", + .clear = "r", }, { - .mask = _PAGE_RW, - .val = _PAGE_RW, - .set = "rw", - .clear = "r ", + .mask = _PAGE_WRITE, + .val = 0, + .set = " ", + .clear = "w", }, { .mask = _PAGE_EXEC, .val = _PAGE_EXEC, @@ -67,18 +67,25 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, |
